From beb16b2c810a87b28e7b8a7aa29d26f842f654b9 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 10 Dec 2024 03:27:11 -0700 Subject: [PATCH 001/357] [Bugfix] Handle <|tool_call|> token in granite tool parser (#11039) Signed-off-by: Travis Johnson --- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index b5854ca39ab47..00917c866e496 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -35,11 +35,13 @@ class GraniteToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) + self.bot_token = "<|tool_call|>" def extract_tool_calls( self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: - stripped = model_output.strip() + # remove whitespace and the BOT token if it exists + stripped = model_output.strip().removeprefix(self.bot_token).lstrip() if not stripped or stripped[0] != '[': return ExtractedToolCallInformation(tools_called=False, tool_calls=[], From d05f88679bedd73939251a17c3d785a354b2946c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 10 Dec 2024 19:12:01 +0800 Subject: [PATCH 002/357] [Misc][LoRA] Add PEFTHelper for LoRA (#11003) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_manager.py | 58 +++++++++++++++++++++++++-- vllm/lora/lora.py | 18 +++++++++ vllm/lora/models.py | 42 ++++++++------------ vllm/lora/peft_helper.py | 70 +++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 28 deletions(-) create mode 100644 vllm/lora/peft_helper.py diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 8d109b2c81503..0b76f466702fc 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,3 +1,4 @@ +import json import os from typing import Dict, List @@ -13,6 +14,7 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) @@ -30,18 +32,68 @@ ] +def test_peft_helper(sql_lora_files): + lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") + with open(lora_config_path) as f: + config = json.load(f) + peft_helper = PEFTHelper.from_dict(config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + + expected_error = "vLLM only supports modules_to_save being None." + with pytest.raises(ValueError, match=expected_error): + config = dict( + r=8, + lora_alpha=16, + target_modules=["gate_proj"], + modules_to_save=["lm_head"], + ) + PEFTHelper.from_dict(config) + expected_error = "vLLM does not yet support RSLoRA." + with pytest.raises(ValueError, match=expected_error): + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_rslora=True) + PEFTHelper.from_dict(config) + + expected_error = "vLLM does not yet support DoRA." 
+ with pytest.raises(ValueError, match=expected_error): + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_dora=True) + PEFTHelper.from_dict(config) + + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) + + lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") + with open(lora_config_path) as f: + config = json.load(f) + + peft_helper = PEFTHelper.from_dict(config) lora_model = LoRAModel.from_lora_tensors( 1, - 8, - 16, tensors, - device, + peft_helper=peft_helper, + device=device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index b648312ba76ec..dde347b78bf81 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -4,6 +4,7 @@ import torch import torch.types +from vllm.lora.peft_helper import PEFTHelper from vllm.utils import is_pin_memory_available @@ -59,6 +60,23 @@ def extra_vocab_size(self) -> int: return self.embeddings_tensor.shape[ 0] if self.embeddings_tensor is not None else 0 + @classmethod + def from_config( + cls, + module_name: str, + peft_helper: PEFTHelper, + embeddings_tensor: Optional[torch.Tensor] = None, + ) -> "LoRALayerWeights": + return cls( + module_name, + peft_helper.r, + peft_helper.lora_alpha, + None, + None, + None, + embeddings_tensor, + ) + @classmethod def create_dummy_lora_weights( cls, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 49cd9f0c236ad..70806a77b9fff 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,6 +21,7 @@ LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, is_regex_target_modules, @@ -104,14 +105,12 @@ def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: def from_lora_tensors( cls, lora_model_id: int, - rank: int, - lora_alpha: int, tensors: Dict[str, torch.Tensor], + peft_helper: PEFTHelper, device: str = "cuda", dtype: Optional[torch.dtype] = None, embeddings: Optional[Dict[str, torch.Tensor]] = None, target_embedding_padding: Optional[int] = None, - scaling_factor: Optional[float] = None, embedding_modules: Optional[Dict[str, str]] = None, embedding_padding_modules: Optional[List[str]] = None, ) -> "LoRAModel": @@ -135,10 +134,9 @@ def from_lora_tensors( if pin_memory: lora_embeddings_tensor = ( lora_embeddings_tensor.pin_memory()) - loras[module_name] = LoRALayerWeights(module_name, rank, - lora_alpha, None, None, - None, - lora_embeddings_tensor) + loras[module_name] = LoRALayerWeights.from_config( + module_name, peft_helper, lora_embeddings_tensor) + if is_bias: loras[module_name].bias = tensor.to(device=device, dtype=dtype).t() @@ -170,7 +168,11 @@ def from_lora_tensors( for lora in loras.values(): lora.optimize() - return cls(lora_model_id, rank, loras, scaling_factor=scaling_factor) + + return cls(lora_model_id, + peft_helper.r, + loras, + scaling_factor=peft_helper.vllm_scaling_factor) @classmethod def from_local_checkpoint( @@ -212,6 +214,9 @@ def from_local_checkpoint( "new_embeddings.bin") with open(lora_config_path) as f: config = json.load(f) + + 
config["vllm_max_position_embeddings"] = max_position_embeddings + peft_helper = PEFTHelper.from_dict(config) if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} # Find unexpected modules. @@ -242,7 +247,7 @@ def from_local_checkpoint( # When a bin file is provided, we rely on config to find unexpected # modules. unexpected_modules = [] - target_modules = config["target_modules"] + target_modules = peft_helper.target_modules if not isinstance(target_modules, list): target_modules = [target_modules] for module in target_modules: @@ -256,7 +261,7 @@ def from_local_checkpoint( # https://github.com/vllm-project/vllm/pull/5909. But there's no # other better mechanism. if unexpected_modules and not is_regex_target_modules( - config["target_modules"], expected_lora_modules): + peft_helper.target_modules, expected_lora_modules): raise ValueError( f"While loading {lora_dir}, expected" f" target modules in {expected_lora_modules}" @@ -274,30 +279,17 @@ def from_local_checkpoint( embeddings = torch.load(new_embeddings_bin_file_path, map_location=device) - rank = config["r"] - lora_alpha = config["lora_alpha"] - context_length = config.get("context_length", None) - scaling_factor = None - if context_length: - if max_position_embeddings is None: - max_position_embeddings = context_length - scaling_factor = float( - math.ceil(context_length / max_position_embeddings)) - return cls.from_lora_tensors( lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, - rank=rank, - lora_alpha=lora_alpha, tensors=tensors, + peft_helper=peft_helper, device=device, dtype=dtype, embeddings=embeddings, target_embedding_padding=target_embedding_padding, - scaling_factor=scaling_factor, embedding_modules=embedding_modules, - embedding_padding_modules=embedding_padding_modules, - ) + embedding_padding_modules=embedding_padding_modules) class LoRAModelManager(AdapterModelManager): diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py new file mode 100644 index 0000000000000..edf4ba5659575 --- /dev/null +++ b/vllm/lora/peft_helper.py @@ -0,0 +1,70 @@ +# Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py + +import math +from dataclasses import MISSING, dataclass, field, fields +from typing import Literal, Optional, Union + + +@dataclass +class PEFTHelper: + # Required fields + r: int + lora_alpha: int + target_modules: Union[list[str], str] + + bias: Literal["none", "all", "lora_only"] = field(default="none") + modules_to_save: Optional[list[str]] = field(default=None) + use_rslora: bool = field(default=False) + use_dora: bool = field(default=False) + # long lora field + context_length: int = field(default=0) + # Extra vllm field, start with 'vllm_' to avoid conflict + vllm_max_position_embeddings: Optional[int] = field(default=False) + vllm_scaling_factor: Optional[float] = field(default=None) + + def _validate_features(self): + error_msg = [] + + if self.modules_to_save: + error_msg.append("vLLM only supports modules_to_save being None.") + if self.use_rslora: + error_msg.append("vLLM does not yet support RSLoRA.") + + if self.use_dora: + error_msg.append("vLLM does not yet support DoRA.") + + if error_msg: + raise ValueError(f"{', '.join(error_msg)}") + + def __post_init__(self): + self._validate_features() + if self.context_length: + if self.vllm_max_position_embeddings is None: + self.vllm_max_position_embeddings = self.context_length + self.vllm_scaling_factor = float( + math.ceil(self.context_length / + 
self.vllm_max_position_embeddings)) + + @classmethod + def from_dict(cls, config_dict: dict) -> "PEFTHelper": + # Get all field information from the class + class_fields = {f.name: f for f in fields(cls)} + # Check for required fields + required_fields = { + name + for name, f in class_fields.items() + if f.default is MISSING and f.default_factory is MISSING + } + + # Identify any missing required fields + missing_fields = required_fields - set(config_dict.keys()) + if missing_fields: + raise ValueError( + f"Missing required configuration fields: {missing_fields}") + + # Filter out fields that aren't defined in the class + filtered_dict = { + k: v + for k, v in config_dict.items() if k in class_fields + } + return cls(**filtered_dict) From 9b9cef3145381721fa950c89718fe71849ac2a55 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 10 Dec 2024 09:38:23 -0700 Subject: [PATCH 003/357] [Bugfix] Backport request id validation to v0 (#11036) Signed-off-by: Joe Runde --- vllm/engine/multiprocessing/client.py | 4 ++++ vllm/v1/engine/async_llm.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 32bd83305bb8f..a729023bc00bb 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -576,6 +576,10 @@ async def _process_request( if self._errored_with is not None: raise ENGINE_DEAD_ERROR(self._errored_with) + # Ensure the request id is unique among running requests + if request_id in self.output_queues: + raise ValueError(f"Request {request_id} already exists") + # Constructing guided decoding logits processors is expensive, so we do # it here to avoid contending with cpu resources and the GIL on the # backend process. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 26fd650aee4b7..24cafeff63d1e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,7 +152,7 @@ async def add_request( """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): - raise KeyError(f"Request {request_id} already exists.") + raise ValueError(f"Request {request_id} already exists.") # 1) Create a new AsyncStream for the request. 
stream = self._add_request_to_streams(request_id) From 250ee65d72a0c7b86ec5cea9cbe9377da21d6439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1via=20B=C3=A9o?= <119421251+flaviabeo@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:38:15 -0300 Subject: [PATCH 004/357] [BUG] Remove token param #10921 (#11022) Signed-off-by: Flavia Beo --- vllm/transformers_utils/config.py | 63 ++++++++++++++----------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3da99bcbee9ae..4529cf27ef565 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,5 +1,6 @@ import enum import json +import os from pathlib import Path from typing import Any, Dict, Optional, Type, Union @@ -41,6 +42,7 @@ from transformers import AutoConfig MISTRAL_CONFIG_NAME = "params.json" +HF_TOKEN = os.getenv('HF_TOKEN', None) logger = init_logger(__name__) @@ -77,8 +79,8 @@ class ConfigFormat(str, enum.Enum): MISTRAL = "mistral" -def file_or_path_exists(model: Union[str, Path], config_name, revision, - token) -> bool: +def file_or_path_exists(model: Union[str, Path], config_name: str, + revision: Optional[str]) -> bool: if Path(model).exists(): return (Path(model) / config_name).is_file() @@ -93,7 +95,10 @@ def file_or_path_exists(model: Union[str, Path], config_name, revision, # NB: file_exists will only check for the existence of the config file on # hf_hub. This will fail in offline mode. try: - return file_exists(model, config_name, revision=revision, token=token) + return file_exists(model, + config_name, + revision=revision, + token=HF_TOKEN) except huggingface_hub.errors.OfflineModeIsEnabled: # Don't raise in offline mode, all we know is that we don't have this # file cached. @@ -161,7 +166,6 @@ def get_config( revision: Optional[str] = None, code_revision: Optional[str] = None, config_format: ConfigFormat = ConfigFormat.AUTO, - token: Optional[str] = None, **kwargs, ) -> PretrainedConfig: # Separate model folder from file path for GGUF models @@ -173,19 +177,20 @@ def get_config( if config_format == ConfigFormat.AUTO: if is_gguf or file_or_path_exists( - model, HF_CONFIG_NAME, revision=revision, token=token): + model, HF_CONFIG_NAME, revision=revision): config_format = ConfigFormat.HF - elif file_or_path_exists(model, - MISTRAL_CONFIG_NAME, - revision=revision, - token=token): + elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, + revision=revision): config_format = ConfigFormat.MISTRAL else: # If we're in offline mode and found no valid config format, then # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). 
- file_exists(model, HF_CONFIG_NAME, revision=revision, token=token) + file_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=HF_TOKEN) raise ValueError(f"No supported config format found in {model}") @@ -194,7 +199,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) @@ -206,7 +211,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) else: @@ -216,7 +221,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) except ValueError as e: @@ -234,7 +239,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision, token=token, **kwargs) + config = load_params_config(model, revision, token=HF_TOKEN, **kwargs) else: raise ValueError(f"Unsupported config format: {config_format}") @@ -256,8 +261,7 @@ def get_config( def get_hf_file_to_dict(file_name: str, model: Union[str, Path], - revision: Optional[str] = 'main', - token: Optional[str] = None): + revision: Optional[str] = 'main'): """ Downloads a file from the Hugging Face Hub and returns its contents as a dictionary. @@ -266,7 +270,6 @@ def get_hf_file_to_dict(file_name: str, - file_name (str): The name of the file to download. - model (str): The name of the model on the Hugging Face Hub. - revision (str): The specific version of the model. - - token (str): The Hugging Face authentication token. Returns: - config_dict (dict): A dictionary containing @@ -276,8 +279,7 @@ def get_hf_file_to_dict(file_name: str, if file_or_path_exists(model=model, config_name=file_name, - revision=revision, - token=token): + revision=revision): if not file_path.is_file(): try: @@ -296,9 +298,7 @@ def get_hf_file_to_dict(file_name: str, return None -def get_pooling_config(model: str, - revision: Optional[str] = 'main', - token: Optional[str] = None): +def get_pooling_config(model: str, revision: Optional[str] = 'main'): """ This function gets the pooling and normalize config from the model - only applies to @@ -315,8 +315,7 @@ def get_pooling_config(model: str, """ modules_file_name = "modules.json" - modules_dict = get_hf_file_to_dict(modules_file_name, model, revision, - token) + modules_dict = get_hf_file_to_dict(modules_file_name, model, revision) if modules_dict is None: return None @@ -332,8 +331,7 @@ def get_pooling_config(model: str, if pooling: pooling_file_name = "{}/config.json".format(pooling["path"]) - pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision, - token) + pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision) pooling_type_name = next( (item for item, val in pooling_dict.items() if val is True), None) @@ -368,8 +366,8 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]: def get_sentence_transformer_tokenizer_config(model: str, - revision: Optional[str] = 'main', - token: Optional[str] = None): + revision: Optional[str] = 'main' + ): """ Returns the tokenization configuration dictionary for a given Sentence Transformer BERT model. @@ -379,7 +377,6 @@ def get_sentence_transformer_tokenizer_config(model: str, BERT model. - revision (str, optional): The revision of the m odel to use. Defaults to 'main'. - - token (str): A Hugging Face access token. 
Returns: - dict: A dictionary containing the configuration parameters @@ -394,7 +391,7 @@ def get_sentence_transformer_tokenizer_config(model: str, "sentence_xlm-roberta_config.json", "sentence_xlnet_config.json", ]: - encoder_dict = get_hf_file_to_dict(config_name, model, revision, token) + encoder_dict = get_hf_file_to_dict(config_name, model, revision) if encoder_dict: break @@ -474,16 +471,14 @@ def _reduce_config(config: VllmConfig): exc_info=e) -def load_params_config(model: Union[str, Path], - revision: Optional[str], - token: Optional[str] = None, +def load_params_config(model: Union[str, Path], revision: Optional[str], **kwargs) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format config_file_name = "params.json" - config_dict = get_hf_file_to_dict(config_file_name, model, revision, token) + config_dict = get_hf_file_to_dict(config_file_name, model, revision) assert isinstance(config_dict, dict) config_mapping = { From e7391949267a4eff3d84f02119f442f46b16d163 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 10 Dec 2024 15:08:16 -0500 Subject: [PATCH 005/357] [Core] Update to outlines >= 0.1.8 (#10576) Signed-off-by: Russell Bryant --- requirements-common.txt | 2 +- .../guided_decoding/outlines_logits_processors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 112528880c0ac..c71fc458aca13 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines >= 0.0.43, < 0.1 +outlines >= 0.1.8 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e1309c31f77e7..1f0dbe024609d 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -99,7 +99,7 @@ class RegexLogitsProcessor(BaseLogitsProcessor): def _get_guide(cls, regex_string: str, tokenizer: PreTrainedTokenizerBase) -> Guide: tokenizer = _adapt_tokenizer(tokenizer) - return RegexGuide(regex_string, tokenizer) + return RegexGuide.from_regex(regex_string, tokenizer) def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): """Compile the FSM that drives the regex-structured generation. 
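For readers following the outlines upgrade in the patch above: from outlines 0.1 onward, guides are built through a classmethod rather than the class constructor that older releases accepted. A minimal sketch of the new call pattern follows; it assumes outlines >= 0.1.8, that the import path matches the one already used by outlines_logits_processors.py, and that the tokenizer has been adapted for outlines beforehand (vLLM does this via _adapt_tokenizer before constructing the guide).

    # Sketch only, not part of the patches above. Assumes outlines >= 0.1.8 and a
    # tokenizer already wrapped for outlines (convert_token_to_string, special
    # token handling, etc.), as _adapt_tokenizer produces.
    from outlines.fsm.guide import RegexGuide

    def build_regex_guide(regex_string: str, adapted_tokenizer) -> RegexGuide:
        # outlines < 0.1 used the constructor:
        #     RegexGuide(regex_string, adapted_tokenizer)
        # outlines >= 0.1.8 builds the guide through a classmethod instead:
        return RegexGuide.from_regex(regex_string, adapted_tokenizer)
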
From 75f89dc44c6e44cc28bae59d5b40a588735b507b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 10 Dec 2024 12:40:52 -0800 Subject: [PATCH 006/357] [torch.compile] add a flag to track batchsize statistics (#11059) Signed-off-by: youkaichao --- vllm/envs.py | 3 +++ vllm/forward_context.py | 32 +++++++++++++++++++++++- vllm/v1/attention/backends/flash_attn.py | 1 + vllm/v1/worker/gpu_model_runner.py | 2 ++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index ab12a7b48dc53..be5d9985b63a4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -69,6 +69,7 @@ VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = False + VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 def get_default_cache_root(): @@ -452,6 +453,8 @@ def get_default_config_root(): # If set, enable multiprocessing in LLM for the V1 code path. "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), + "VLLM_LOG_BATCHSIZE_INTERVAL": + lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), } # end-env-vars-definition diff --git a/vllm/forward_context.py b/vllm/forward_context.py index aaa3e4bb3a1e8..cd136f43c0c57 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,8 +1,19 @@ +import time +from collections import Counter from contextlib import contextmanager from dataclasses import dataclass from typing import Any, Dict, Optional +import vllm.envs as envs from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + +track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 +batchsize_counter: Counter = Counter() +last_logging_time: float = 0 +batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL @dataclass @@ -26,7 +37,26 @@ def get_forward_context() -> ForwardContext: @contextmanager def set_forward_context(context: Any, vllm_config: VllmConfig): """A context manager that stores the current forward context, - can be attention metadata, etc.""" + can be attention metadata, etc. + Here we can inject common logic for every model forward pass. + """ + global track_batchsize, batchsize_counter + global last_logging_time, batchsize_logging_interval + if track_batchsize and context is not None: + if hasattr(context, "num_prefill_tokens"): + # for v0 attention backends + batchsize = context.num_prefill_tokens + context.num_decode_tokens + else: + # for v1 attention backends + batchsize = context.num_input_tokens + batchsize_counter[batchsize] += 1 + if time.monotonic() - last_logging_time > batchsize_logging_interval: + last_logging_time = time.monotonic() + sorted_data = sorted(batchsize_counter.items(), + key=lambda x: x[1], + reverse=True) + logger.info("Batchsize distribution (batchsize, count): %s", + sorted_data) global _forward_context prev_context = _forward_context _forward_context = ForwardContext( diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 251a103e60f06..c9f04ace644c7 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -56,6 +56,7 @@ class FlashAttentionMetadata: seq_start_loc: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor + num_input_tokens: int = 0 # Number of tokens including padding. 
class FlashAttentionImpl(AttentionImpl): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a5adfb28c9bd..a3335fa838352 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -445,6 +445,8 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + attn_metadata.num_input_tokens = num_input_tokens + # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( From 134810b3d9a05510622282479f0f9e2114b88017 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 10 Dec 2024 14:41:23 -0800 Subject: [PATCH 007/357] [V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061) Signed-off-by: Woosuk Kwon --- vllm/engine/arg_utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3db069ec64ee4..7b9adc401abcf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -122,7 +122,7 @@ class EngineArgs: cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None - max_num_seqs: int = 256 + max_num_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -205,6 +205,9 @@ def __post_init__(self): # by user. if self.enable_prefix_caching is None: self.enable_prefix_caching = bool(envs.VLLM_USE_V1) + # Override max_num_seqs if it's not set by user. + if self.max_num_seqs is None: + self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024 # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -1225,19 +1228,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None: """ assert envs.VLLM_USE_V1, "V1 is not enabled" + # V1 always uses chunked prefills. + self.enable_chunked_prefill = True + # When no user override, set the default values based on the usage + # context. + # TODO(woosuk): Tune the default values for different hardware. if self.max_num_batched_tokens is None: - # When no user override, set the default values based on the - # usage context. 
if usage_context == UsageContext.LLM_CLASS: - logger.warning("Setting max_num_batched_tokens to 8192 " - "for LLM_CLASS usage context.") - self.max_num_seqs = 1024 self.max_num_batched_tokens = 8192 elif usage_context == UsageContext.OPENAI_API_SERVER: - logger.warning("Setting max_num_batched_tokens to 2048 " - "for OPENAI_API_SERVER usage context.") - self.max_num_seqs = 1024 self.max_num_batched_tokens = 2048 + logger.warning( + "Setting max_num_batched_tokens to %d for %s usage context.", + self.max_num_batched_tokens, usage_context.value) def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: """ From 9a93973708d7f52f1d1439f8f32b8c1514d18b86 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 10 Dec 2024 19:16:22 -0500 Subject: [PATCH 008/357] [Bugfix] Fix Mamba multistep (#11071) Signed-off-by: Tyler Michael Smith --- vllm/attention/backends/placeholder_attn.py | 64 ++++++++++++++++++++- vllm/worker/multi_step_model_runner.py | 4 +- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 888adbffb8578..658039bfc3365 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -11,7 +11,8 @@ from vllm.multimodal import MultiModalPlaceholderMap if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder + from vllm.worker.model_runner import (ModelInputForGPUBuilder, + ModelInputForGPUWithSamplingMetadata) # Placeholder attention backend for models like Mamba and embedding models that # lack attention. @@ -186,6 +187,67 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: ) return self._cached_decode_metadata + def advance_step(self, + model_input: "ModelInputForGPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): + """ + Update metadata in-place to advance one decode step. + """ + # When using cudagraph, the num_seqs is padded to the next captured + # batch sized, but num_queries tracks the actual number of requests in + # the batch. For --enforce-eager mode, num_seqs == num_queries + if num_seqs != num_queries: + assert num_seqs > num_queries + assert self.use_cuda_graph + + assert not turn_prefills_into_decodes, \ + ("Multi-Step + Chunked-Prefill is not supported for attention-free" + "models. turn_prefills_into_decodes is a " + "Multi-Step + Chunked-Prefill specific parameter.") + + assert self.seq_lens is not None + assert self.max_decode_seq_len == max(self.seq_lens) + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.num_decode_tokens == num_seqs + + assert self.seq_lens is not None + assert len(self.seq_lens) == num_seqs + assert self.seq_lens_tensor is not None + assert self.seq_lens_tensor.shape == (num_seqs, ) + assert self.max_query_len == 1 + assert self.max_prefill_seq_len == 0 + + assert self.query_start_loc is not None + assert self.query_start_loc.shape == (num_queries + 1, ) + assert self.seq_start_loc is not None + assert self.seq_start_loc.shape == (num_seqs + 1, ) + + assert self.context_lens_tensor is not None + assert self.context_lens_tensor.shape == (num_queries, ) + + assert self.block_tables is not None + + # Update query lengths. 
Note that we update only queries and not seqs, + # since tensors may be padded due to captured cuda graph batch size + for i in range(num_queries): + self.seq_lens[i] += 1 + self.max_decode_seq_len = max(self.seq_lens) + + # Update sequences, masking off entries greater than num_queries + device = self.seq_lens_tensor.device + mask = torch.arange(self.seq_lens_tensor.size(0), + device=device) < num_queries + self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype) + if sampled_token_ids is not None: + model_input.input_tokens.masked_scatter_( + mask, sampled_token_ids[:num_queries]) + class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3ca0d88a42183..e08a61e31fe42 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -29,7 +29,9 @@ logger = init_logger(__name__) -MULTI_STEP_ATTENTION_BACKENDS = ["FLASH_ATTN", "ROCM_FLASH", "FLASHINFER"] +MULTI_STEP_ATTENTION_BACKENDS = [ + "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" +] MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ From d5c5154fcf4c5d65551c98e458cbb027e5f4b672 Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Tue, 10 Dec 2024 21:09:20 -0500 Subject: [PATCH 009/357] [Misc] LoRA + Chunked Prefill (#9057) --- tests/lora/test_chatglm3_tp.py | 9 ++++++--- tests/lora/test_gemma.py | 3 ++- tests/lora/test_llama_tp.py | 6 +++++- tests/lora/test_long_context.py | 3 ++- tests/lora/test_minicpmv.py | 3 ++- tests/lora/test_minicpmv_tp.py | 2 ++ tests/lora/test_mixtral.py | 1 + tests/lora/test_phi.py | 3 ++- tests/lora/test_quant_model.py | 9 ++++++--- vllm/config.py | 3 ++- vllm/core/scheduler.py | 15 ++++++++++++--- vllm/worker/model_runner.py | 12 +++++++----- 12 files changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index f17464573459f..49a527b99ac16 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -53,7 +53,8 @@ def test_chatglm3_lora(chatglm3_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=1, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -73,7 +74,8 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=False) + fully_sharded_loras=False, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -93,7 +95,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=True) + fully_sharded_loras=True, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 15ec66b0f5502..5ae705e474ec6 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -37,7 +37,8 @@ def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, - max_loras=4) + max_loras=4, + enable_chunked_prefill=True) expected_lora_output = [ 
"more important than knowledge.\nAuthor: Albert Einstein\n", diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index d3ca7f878191a..dfeac380951d8 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -78,7 +78,8 @@ def test_llama_lora(sql_lora_files): enable_lora=True, max_num_seqs=16, max_loras=4, - tensor_parallel_size=1) + tensor_parallel_size=1, + enable_chunked_prefill=True) generate_and_test(llm, sql_lora_files) @@ -120,6 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_num_seqs=16, max_loras=4, tensor_parallel_size=4, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) @@ -135,6 +137,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): max_loras=4, tensor_parallel_size=4, fully_sharded_loras=True, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) @@ -151,5 +154,6 @@ def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, enable_lora_bias=True, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index eada902c891f7..e7a34f2ced7ed 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -124,7 +124,8 @@ def lora_llm(long_context_infos): tensor_parallel_size=4, # FIXME enable async output processor disable_async_output_proc=True, - distributed_executor_backend="mp") + distributed_executor_backend="mp", + enable_chunked_prefill=True) yield llm del llm diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 2c45ce5141f7d..1f3de9edc0d0f 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -67,7 +67,8 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, - gpu_memory_utilization=0.97 # This model is pretty big for CI gpus + gpu_memory_utilization=0.97, # This model is pretty big for CI gpus + enable_chunked_prefill=True, ) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ba29e562e58ec..930f177953a5f 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -69,6 +69,7 @@ def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): tensor_parallel_size=2, trust_remote_code=True, fully_sharded_loras=fully_sharded, + enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) @@ -89,6 +90,7 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=fully_sharded, + enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index dddc299da446b..150221dfce6ab 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -47,6 +47,7 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): max_loras=4, distributed_executor_backend="ray", tensor_parallel_size=tp_size, + enable_chunked_prefill=True, ) expected_lora_output = [ diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 733eff48a9bf3..5a3fcb8d690d9 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -53,7 +53,8 @@ def test_phi2_lora(phi2_lora_files): max_model_len=1024, enable_lora=True, max_loras=2, - enforce_eager=True) + 
enforce_eager=True, + enable_chunked_prefill=True) expected_lora_output = [ "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5432fa4ad0d3a..026269667b473 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -84,7 +84,8 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, tensor_parallel_size=tp_size, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) if model.quantization is None: expected_no_lora_output = [ @@ -176,7 +177,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, tensor_parallel_size=1, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 @@ -189,7 +191,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, max_loras=4, tensor_parallel_size=2, gpu_memory_utilization=0.2, #avoid OOM - quantization=model.quantization) + quantization=model.quantization, + enable_chunked_prefill=True) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 diff --git a/vllm/config.py b/vllm/config.py index 5fb9563fcf3a3..c66ddbb47f22e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1698,7 +1698,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: - raise ValueError("LoRA is not supported with chunked prefill yet.") + logger.warning("LoRA with chunked prefill is still experimental " + "and may be unstable.") @dataclass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index d23009dae01ee..94c62743883ec 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -166,9 +166,18 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self): - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.seq_group.lora_int_id, g.seq_group.request_id)) + assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups) + + def key_fn(group: ScheduledSequenceGroup): + key = (group.seq_group.lora_int_id, group.seq_group.request_id) + if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups): + # Sort sequence groups so that all prefills come before all + # decodes as required by chunked prefill. 
+ return (not group.seq_group.is_prefill(), *key) + return key + + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=key_fn) @property def lora_requests(self) -> Set[LoRARequest]: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1bc5f65c7127f..551b84435fdc0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -622,11 +622,13 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, inter_data.lora_requests.add(seq_group_metadata.lora_request) query_len = inter_data.query_lens[seq_idx] inter_data.lora_index_mapping.append([lora_id] * query_len) - inter_data.lora_prompt_mapping.append( - [lora_id] * - (query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.prompt_logprobs is not None - else 1)) + sampling_params = seq_group_metadata.sampling_params + if sampling_params and sampling_params.prompt_logprobs is not None: + inter_data.lora_prompt_mapping.append([lora_id] * query_len) + elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: + inter_data.lora_prompt_mapping.append([lora_id]) + else: + inter_data.lora_prompt_mapping.append([]) def _compute_prompt_adapter_input( self, inter_data: InterDataForSeqGroup, From ffa48c9146fda1e8810d1cfa159e1d70aadae6c6 Mon Sep 17 00:00:00 2001 From: Mor Zusman Date: Wed, 11 Dec 2024 04:53:37 +0200 Subject: [PATCH 010/357] [Model] PP support for Mamba-like models (#10992) Signed-off-by: mzusman --- docs/source/models/supported_models.rst | 6 +- tests/distributed/test_pipeline_parallel.py | 6 +- vllm/config.py | 58 +++++++++---- vllm/model_executor/models/interfaces.py | 37 ++++++++ vllm/model_executor/models/jamba.py | 93 ++++++++++++++------- vllm/model_executor/models/mamba.py | 68 ++++++++++----- vllm/model_executor/models/registry.py | 11 ++- vllm/utils.py | 5 ++ vllm/v1/worker/gpu_model_runner.py | 8 +- vllm/v1/worker/gpu_worker.py | 6 +- vllm/worker/cache_engine.py | 12 +-- 11 files changed, 229 insertions(+), 81 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 4e5b10967e3bb..6540e023c1ab0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -128,7 +128,7 @@ Text Generation - FalconMamba - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. - ✅︎ - - + - ✅︎ * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -193,7 +193,7 @@ Text Generation - Jamba - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. - ✅︎ - - + - ✅︎ * - :code:`LlamaForCausalLM` - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. @@ -203,7 +203,7 @@ Text Generation - Mamba - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - - + - ✅︎ * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. 
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index b818ca921fcb0..85d408efafe96 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -156,13 +156,13 @@ def iter_params(self, model_name: str): # "internlm/internlm-chat-7b": PPTestSettings.fast(), "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), "inceptionai/jais-13b-chat": PPTestSettings.fast(), - # TODO: Implement PP - # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(), + "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), # Uses Llama # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), + "state-spaces/mamba-130m-hf": PPTestSettings.fast(), "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), @@ -234,6 +234,8 @@ def iter_params(self, model_name: str): "OpenGVLab/InternVL2-1B", "microsoft/Phi-3-vision-128k-instruct", "fixie-ai/ultravox-v0_3", + # [LANGUAGE GENERATION - HYBRID ARCH] + "ai21labs/Jamba-tiny-dev", ] diff --git a/vllm/config.py b/vllm/config.py index c66ddbb47f22e..2a9f0ebae997d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,8 +27,8 @@ ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) -from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - print_warning_once, random_uuid, +from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, + get_cpu_memory, print_warning_once, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: @@ -284,6 +284,7 @@ def __init__( self._verify_tokenizer_mode() self.is_attention_free = self._init_attention_free() + self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() if current_platform.is_neuron(): @@ -340,6 +341,10 @@ def _init_attention_free(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_attention_free_model(architectures) + def _init_is_hybrid(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_hybrid_model(architectures) + def _init_has_inner_state(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.model_has_inner_state(architectures) @@ -669,26 +674,51 @@ def get_num_attention_heads(self, num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) return num_heads // parallel_config.tensor_parallel_size - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + def get_layers_start_end_indices( + self, parallel_config: "ParallelConfig") -> Tuple[int, int]: from vllm.distributed.utils import get_pp_indices total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size pp_size = parallel_config.pipeline_parallel_size start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) - return end - start - - def get_num_attention_layers(self, - parallel_config: "ParallelConfig") -> int: - if self.is_attention_free: - return 0 + return start, end - num_layers = 
self.get_num_layers(parallel_config) + def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start - # Transformers supports layers_block_type @property - layers = getattr(self.hf_config, "layers_block_type", - ["attention"] * num_layers) - return len([t for t in layers if t == "attention"]) + def get_num_layers_by_block_type( + self, + parallel_config: "ParallelConfig", + block_type: LayerBlockType = LayerBlockType.attention, + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == LayerBlockType.attention + is_transformer = not self.is_hybrid and not self.is_attention_free + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + else: + # Hybrid model + layers_block_type_value = getattr(self.hf_config, + "layers_block_type", None) + if layers_block_type_value is None: + raise ValueError("The model is an hybrid without a" + "layers_block_type in the hf_config," + "cannot determine the num of " + f"{block_type.value} layers") + + return sum(t == block_type.value + for t in layers_block_type_value[start:end]) def get_multimodal_config(self) -> "MultiModalConfig": """ diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c3979eab905db..70b78fe64f2d8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -363,6 +363,43 @@ def is_attention_free( return isinstance(model, IsAttentionFree) +@runtime_checkable +class IsHybrid(Protocol): + """The interface required for all models like Jamba that have both + attention and mamba blocks, indicates that + hf_config has 'layers_block_type'""" + + is_hybrid: ClassVar[Literal[True]] = True + """ + A flag that indicates this model has both mamba and attention blocks + , also indicates that the model's hf_config has + 'layers_block_type' """ + + +@runtime_checkable +class _IsHybridType(Protocol): + is_hybrid: ClassVar[Literal[True]] + + +@overload +def is_hybrid(model: object) -> TypeIs[IsHybrid]: + ... + + +@overload +def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: + ... 
+ + +def is_hybrid( + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + if isinstance(model, type): + return isinstance(model, _IsHybridType) + + return isinstance(model, IsHybrid) + + @runtime_checkable class SupportsCrossEncoding(Protocol): """The interface required for all models that support cross encoding.""" diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 5d5e8ae1ee532..6bb4c13ab35df 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -9,6 +9,7 @@ from vllm.attention.layer import Attention from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -25,9 +26,12 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.utils import LayerBlockType -from .interfaces import HasInnerState, SupportsLoRA -from .utils import maybe_prefix +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -281,16 +285,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=config.vocab_size, ) - decoder_layers = [] - for i in range(config.num_hidden_layers): - layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]] - decoder_layers.append( - layer_class(config, - layer_idx=i, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{i}")) - self.layers = nn.ModuleList(decoder_layers) + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + layer_class = ALL_DECODER_LAYER_TYPES[ + config.layers_block_type[layer_idx]] + return layer_class( + config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -304,26 +316,34 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if inputs_embeds is not None: - hidden_states = inputs_embeds + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - for i in range(len(self.layers)): + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + kv_cache_index = 0 + mamba_cache_index = 0 + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] kv_cache = None layer_mamba_cache_params = None if 
isinstance(layer, JambaAttentionDecoderLayer): - kv_cache = kv_caches[(i - self.config.attn_layer_offset) // - self.config.attn_layer_period] + kv_cache = kv_caches[kv_cache_index] + kv_cache_index += 1 if isinstance(layer, JambaMambaDecoderLayer): - current_state_layer = i - (1 + - (i - self.config.attn_layer_offset) - // self.config.attn_layer_period) + current_state_layer = mamba_cache_index layer_mamba_cache_params = mamba_cache_params.at_layer_idx( current_state_layer) + mamba_cache_index += 1 hidden_states, residual = layer( positions=positions, @@ -332,11 +352,17 @@ def forward( attn_metadata=attn_metadata, residual=residual, mamba_cache_params=layer_mamba_cache_params) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states -class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): +class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + IsHybrid): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -368,6 +394,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config self.scheduler_config = scheduler_config self.model = JambaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -390,6 +418,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -406,10 +437,8 @@ def forward(self, self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) - layers_type = self.config.layers_block_type - num_mamba_layers = sum( - [layer_type == "mamba" for layer_type in layers_type]) - + num_mamba_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, *self._get_mamba_cache_shape()) @@ -423,7 +452,7 @@ def forward(self, state_indices_tensor) hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, mamba_cache_params, - inputs_embeds) + intermediate_tensors, inputs_embeds) return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): @@ -504,8 +533,12 @@ def load_weights(self, weights: Iterable[Tuple[str, continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -520,6 +553,8 @@ def load_weights(self, weights: Iterable[Tuple[str, if weight_name not in name: continue + if is_pp_missing_parameter(name, self): + continue name = name.replace(weight_name, param_name) param = params_dict[name] weight_loader = param.weight_loader @@ -533,6 +568,8 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 8bdcd2c5aad1f..1f5cd02711899 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -8,6 +8,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer @@ -18,13 +19,16 @@ DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, - IsAttentionFree) + IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.utils import LayerBlockType -from .utils import maybe_prefix +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -95,15 +99,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=config.vocab_size, ) - decoder_layers = [] - for i in range(config.num_hidden_layers): - decoder_layers.append( - MambaDecoderLayer(config, - cache_config=cache_config, - quant_config=quant_config)) - self.layers = nn.ModuleList(decoder_layers) + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MambaDecoderLayer( + config, cache_config=cache_config, quant_config=quant_config), + prefix=f"{prefix}.layers") + self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embeddings(input_ids) @@ -114,29 +120,40 @@ def forward( positions: torch.Tensor, attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - - if inputs_embeds is not None: - hidden_states = inputs_embeds + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] - for i in range(len(self.layers)): + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, attn_metadata=attn_metadata, residual=residual, - mamba_cache_params=mamba_cache_params.at_layer_idx(i)) + 
mamba_cache_params=mamba_cache_params.at_layer_idx( + i - self.start_layer)) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm_f(hidden_states, residual) return hidden_states -class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -148,7 +165,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = config + self.vllm_config = vllm_config self.scheduler_config = scheduler_config + self.model_config = vllm_config.model_config self.backbone = MambaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")) self.unpadded_vocab_size = config.vocab_size @@ -174,6 +193,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.backbone.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.backbone.get_input_embeddings(input_ids) @@ -189,9 +211,12 @@ def forward(self, max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) + + num_mamba_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.lm_head.weight.dtype, self.config.num_hidden_layers, - max_batch_size, *self._get_mamba_cache_shape()) + self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, + *self._get_mamba_cache_shape()) ( mamba_cache_tensors, @@ -204,7 +229,8 @@ def forward(self, state_indices_tensor) hidden_states = self.backbone(input_ids, positions, attn_metadata, - mamba_cache_params, inputs_embeds) + mamba_cache_params, intermediate_tensors, + inputs_embeds) return hidden_states @@ -252,6 +278,8 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e69596aa915b5..4beea4641f5ab 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -21,7 +21,7 @@ from vllm.platforms import current_platform from .adapters import as_embedding_model -from .interfaces import (has_inner_state, is_attention_free, +from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_pp) from .interfaces_base import is_pooling_model, is_text_generation_model @@ -218,6 +218,7 @@ class _ModelInfo: supports_pp: bool has_inner_state: bool is_attention_free: bool + is_hybrid: bool @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": @@ -239,6 +240,7 @@ def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), + is_hybrid=is_hybrid(model), ) @@ -484,6 +486,13 @@ def is_attention_free_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free + def is_hybrid_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_hybrid + ModelRegistry = _ModelRegistry({ model_arch: _LazyRegisteredModel( diff --git a/vllm/utils.py b/vllm/utils.py index 7cdb2cb320b05..1882264c19775 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -170,6 +170,11 @@ class Device(enum.Enum): CPU = enum.auto() +class LayerBlockType(enum.Enum): + attention = "attention" + mamba = "mamba" + + class Counter: def __init__(self, start: int = 0) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a3335fa838352..8d9976ded7c5e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -15,8 +15,8 @@ from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingType -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, - is_pin_memory_available) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.outputs import ModelRunnerOutput @@ -68,8 +68,8 @@ def __init__( self.max_num_tokens = scheduler_config.max_num_batched_tokens # Model-related. 
- self.num_attn_layers = model_config.get_num_attention_layers( - parallel_config) + self.num_attn_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d32848c3775ae..49e415ab72e0b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -14,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -260,8 +260,8 @@ def _get_cache_block_size( ) -> int: head_size = model_config.get_head_size() num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ac3270d1c9909..7ccd4571b19df 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,8 +6,8 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, - is_pin_memory_available) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, + get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -34,8 +34,8 @@ def __init__( self.head_size = model_config.get_head_size() # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + self.num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.block_size = cache_config.block_size @@ -105,8 +105,8 @@ def get_cache_block_size( ) -> int: head_size = model_config.get_head_size() num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block From e39400a4b60d28ff5c0a1a5194068c928adcaf98 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 11 Dec 2024 01:51:40 -0300 Subject: [PATCH 011/357] Fix streaming for granite tool call when <|tool_call|> is present (#11069) Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 00917c866e496..dae481a2154a1 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -88,7 +88,11 @@ def extract_tool_calls_streaming( ) -> Union[DeltaMessage, None]: start_idx = consume_space(0, current_text) - if not current_text or current_text[start_idx] != '[': + if current_text[start_idx:].startswith(self.bot_token): + start_idx = consume_space(start_idx + len(self.bot_token), + current_text) + if not current_text or start_idx >= len(current_text)\ + or current_text[start_idx] != '[': return DeltaMessage(content=delta_text) # bit mask flags for partial JSON parsing. If the name hasn't been From 2e33fe419186c65a18da6668972d61d7bbc31564 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 13:02:02 +0800 Subject: [PATCH 012/357] [CI/Build] Check transformers v4.47 (#10991) Signed-off-by: DarkLight1337 --- requirements-test.txt | 4 ++-- .../vision_language/mm_processor_kwargs/test_idefics3.py | 9 --------- .../models/embedding/vision_language/test_llava_next.py | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 38a064bca449a..8ceb705cdffd7 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -506,7 +506,7 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements-test.in -tokenizers==0.20.3 +tokenizers==0.21.0 # via transformers torch==2.5.1 # via @@ -534,7 +534,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.46.3 +transformers==4.47.0 # via # lm-eval # peft diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py index 31896bfd13e8c..c71a2d359043d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py @@ -3,7 +3,6 @@ import pytest import torch -import transformers from transformers import AutoImageProcessor, AutoTokenizer from vllm.inputs import InputContext, token_inputs @@ -36,8 +35,6 @@ def get_max_idefics3_image_tokens(): return get_max_idefics3_image_tokens -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) def test_input_mapper_override(model: str, image_assets: _ImageAssets, @@ -77,8 +74,6 @@ def test_input_mapper_override(model: str, image_assets: _ImageAssets, assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge, expected_max_tokens", [ (None, 2873), @@ -107,8 +102,6 @@ def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, assert expected_max_tokens == actual_max_tokens -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ (168, 169, 1), @@ -143,8 +136,6 @@ def test_dummy_data_override(dummy_data_for_idefics3, model: str, assert img_tok_count == toks_per_img * num_imgs -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) 
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ (336, 169 * (1**2 + 1), 1), diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 329c6ba279f89..693abd7252d5e 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -86,7 +86,7 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__.startswith("4.46"), +@pytest.mark.skipif(transformers.__version__ >= "4.46", reason="Model broken with changes in transformers 4.46") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) From 3fb4b4f1634a896653acc12c72b8e5d6d87a8f82 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 00:39:53 -0800 Subject: [PATCH 013/357] [ci/build] Fix AMD CI dependencies (#11087) --- requirements-rocm.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 121123611d2da..ccc9062341772 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -5,7 +5,8 @@ awscli boto3 botocore +datasets ray >= 2.10.0 peft pytest-asyncio -tensorizer>=2.9.0 \ No newline at end of file +tensorizer>=2.9.0 From 9974fca047bb332ec68377be4579ea515a300d69 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 01:01:53 -0800 Subject: [PATCH 014/357] [ci/build] Fix entrypoints test and pin outlines version (#11088) --- requirements-common.txt | 2 +- .../guided_decoding/outlines_logits_processors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index c71fc458aca13..792cd58e80669 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines >= 0.1.8 +outlines == 0.1.9 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 1f0dbe024609d..b63fed1c8a8c3 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -25,7 +25,7 @@ from outlines import grammars from outlines.caching import cache from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write -from outlines.fsm.json_schema import build_regex_from_schema +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase From 61b1d2f6aef8e29c6a0d795a9c6682d525f4d8cc Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 11 Dec 2024 04:26:36 -0500 Subject: [PATCH 015/357] [Core] v1: Use atexit to handle engine core client shutdown (#11076) Signed-off-by: Russell Bryant --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ee89cece73141..4d96b323d1662 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import atexit import multiprocessing from typing import List, Union @@ -157,6 +158,7 @@ def __init__( should_shutdown=self.should_shutdown, **kwargs, ) + 
atexit.register(self.shutdown) def shutdown(self): # Send shutdown signal to background process. From 2e32f5d28db3cd79f6a421f640e083be1f9468b7 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 11 Dec 2024 17:27:07 +0800 Subject: [PATCH 016/357] [Bugfix] Fix Idefics3 fails during multi-image inference (#11080) Signed-off-by: B-201 --- vllm/model_executor/models/idefics3.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e5d2edbd81eb1..17e772e7faa32 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -60,7 +60,8 @@ class Idefics3ImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_patches, + num_channels, height, width)` """ pixel_attention_mask: Optional[torch.BoolTensor] @@ -520,13 +521,17 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - return Idefics3ImagePixelInputs(type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, - concat=True)), - pixel_attention_mask=flatten_bn( - pixel_attention_mask, - concat=True)) + if isinstance(pixel_values, list): + pixel_values = torch.cat(pixel_values, dim=1) + pixel_attention_mask = torch.cat(pixel_attention_mask, dim=1) + else: + pixel_values = flatten_bn(pixel_values) + pixel_attention_mask = flatten_bn(pixel_attention_mask) + + return Idefics3ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + pixel_attention_mask=pixel_attention_mask) raise AssertionError("This line should be unreachable.") From 40766ca1b8b0ef92e220595bda96c4336b597e5b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 11 Dec 2024 04:27:39 -0500 Subject: [PATCH 017/357] [Bugfix]: Clamp `-inf` logprob values in prompt_logprobs (#11073) Signed-off-by: Rafael Vasquez --- vllm/entrypoints/openai/serving_completion.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c54d5f07cf58c..ee97d35f2b087 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -392,6 +392,12 @@ def request_output_to_completion_response( prompt_token_ids = final_res.prompt_token_ids assert prompt_token_ids is not None prompt_logprobs = final_res.prompt_logprobs + if prompt_logprobs: + for logprob_dict in prompt_logprobs: + if logprob_dict: + for logprob_values in logprob_dict.values(): + if logprob_values.logprob == float('-inf'): + logprob_values.logprob = -9999.0 prompt_text = final_res.prompt token_ids: GenericSequence[int] From 8f10d5e3930f05c2057a831cd80ba24c52b8ceef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 17:28:00 +0800 Subject: [PATCH 018/357] [Misc] Split up pooling tasks (#10820) Signed-off-by: DarkLight1337 --- docs/source/index.rst | 2 + docs/source/models/generative_models.rst | 146 ++++++++++++++++ docs/source/models/pooling_models.rst | 99 +++++++++++ docs/source/models/supported_models.rst | 157 ++++++++++++------ docs/source/usage/compatibility_matrix.rst | 12 +- examples/offline_inference_embedding.py | 7 +- ...ine_inference_vision_language_embedding.py | 4 +- tests/compile/test_basic_correctness.py | 4 +- tests/core/test_scheduler_encoder_decoder.py | 2 
+- .../openai/test_vision_embedding.py | 2 +- .../embedding/language/test_embedding.py | 2 +- .../models/embedding/language/test_scoring.py | 12 +- .../vision_language/test_dse_qwen2_vl.py | 2 +- .../vision_language/test_llava_next.py | 2 +- .../embedding/vision_language/test_phi3v.py | 2 +- tests/test_config.py | 17 +- vllm/config.py | 137 ++++++++++----- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 7 +- vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/llm.py | 53 +++--- vllm/entrypoints/openai/api_server.py | 8 +- vllm/entrypoints/openai/run_batch.py | 4 +- vllm/model_executor/model_loader/utils.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/worker.py | 2 +- 27 files changed, 527 insertions(+), 168 deletions(-) create mode 100644 docs/source/models/generative_models.rst create mode 100644 docs/source/models/pooling_models.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index ebf1361976c5e..842013d6d49c4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,6 +94,8 @@ Documentation :caption: Models models/supported_models + models/generative_models + models/pooling_models models/adding_model models/enabling_multimodal_inputs diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst new file mode 100644 index 0000000000000..fb71185600863 --- /dev/null +++ b/docs/source/models/generative_models.rst @@ -0,0 +1,146 @@ +.. _generative_models: + +Generative Models +================= + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. + +Offline Inference +----------------- + +The :class:`~vllm.LLM` class provides various methods for offline inference. +See :ref:`Engine Arguments ` for a list of options when initializing the model. + +For generative models, the only supported :code:`task` option is :code:`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +``LLM.generate`` +^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. +It is similar to `its counterpart in HF Transformers `__, +except that tokenization and detokenization are also performed automatically. + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + outputs = llm.generate("Hello, my name is") + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. +For example, you can use greedy sampling by setting :code:`temperature=0`: + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = SamplingParams(temperature=0) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference.py `_. 
+ +``LLM.beam_search`` +^^^^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = BeamSearchParams(beam_width=5, max_tokens=50) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``LLM.chat`` +^^^^^^^^^^^^ + +The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. +In particular, it accepts input similar to `OpenAI Chat Completions API `__ +and automatically applies the model's `chat template `__ to format the prompt. + +.. important:: + + In general, only instruction-tuned models have a chat template. + Base models may perform poorly as they are not trained to respond to the chat conversation. + +.. code-block:: python + + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference_chat.py `_. + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +.. code-block:: python + + from vllm.entrypoints.chat_utils import load_chat_template + + # You can find a list of existing chat templates under `examples/` + custom_template = load_chat_template(chat_template="") + print("Loaded chat template:", custom_template) + + outputs = llm.chat(conversation, chat_template=custom_template) + +Online Inference +---------------- + +Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. +Please click on the above link for more details on how to launch the server. + +Completions API +^^^^^^^^^^^^^^^ + +Our Completions API is similar to ``LLM.generate`` but only accepts text. +It is compatible with `OpenAI Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_completion_client.py `_. + +Chat API +^^^^^^^^ + +Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. +It is compatible with `OpenAI Chat Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_chat_completion_client.py `_. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst new file mode 100644 index 0000000000000..7fa66274c3c5a --- /dev/null +++ b/docs/source/models/pooling_models.rst @@ -0,0 +1,99 @@ +.. _pooling_models: + +Pooling Models +============== + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. 
+These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input +before returning them. + +.. note:: + + We currently support pooling models primarily as a matter of convenience. + As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to + pooling models as they only work on the generation or decode stage, so performance may not improve as much. + +Offline Inference +----------------- + +The :class:`~vllm.LLM` class provides various methods for offline inference. +See :ref:`Engine Arguments ` for a list of options when initializing the model. + +For pooling models, we support the following :code:`task` options: + +- Embedding (:code:`"embed"` / :code:`"embedding"`) +- Classification (:code:`"classify"`) +- Sentence Pair Scoring (:code:`"score"`) +- Reward Modeling (:code:`"reward"`) + +The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used: + +- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. +- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. +- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. +- Reward Modeling: Extract all of the hidden states and return them directly. + +When loading `Sentence Transformers `__ models, +we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`). + +You can customize the model's pooling method via the :code:`override_pooler_config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +``LLM.encode`` +^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. +It returns the aggregated hidden states directly. + +.. code-block:: python + + llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") + outputs = llm.encode("Hello, my name is") + + outputs = model.encode(prompts) + for output in outputs: + embeddings = output.outputs.embedding + print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}: {embeddings!r}") + +A code example can be found in `examples/offline_inference_embedding.py `_. + +``LLM.score`` +^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs. +It is primarily designed for `cross-encoder models `__. +These types of models serve as rerankers between candidate query-document pairs in RAG systems. + +.. note:: + + vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. + To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. + +You can use `these tests `_ as reference. + +Online Inference +---------------- + +Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. +Please click on the above link for more details on how to launch the server. + +Embeddings API +^^^^^^^^^^^^^^ + +Our Embeddings API is similar to ``LLM.encode``, accepting both text and :ref:`multi-modal inputs `. + +The text-only API is compatible with `OpenAI Embeddings API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_embedding_client.py `_. + +The multi-modal API is an extension of the `OpenAI Embeddings API `__ +that incorporates `OpenAI Chat Completions API `__, +so it is not part of the OpenAI standard. 
Please see :ref:`this page ` for more details on how to use it. + +Score API +^^^^^^^^^ + +Our Score API is similar to ``LLM.score``. +Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 6540e023c1ab0..b9957cf9563b1 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -3,11 +3,21 @@ Supported Models ================ -vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. -This page lists the model architectures that are currently supported by vLLM. +vLLM supports generative and pooling models across various tasks. +If a model supports more than one task, you can set the task via the :code:`--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -For other models, you can check the :code:`config.json` file inside the model repository. +Loading a Model +^^^^^^^^^^^^^^^ + +HuggingFace Hub ++++++++++++++++ + +By default, vLLM loads models from `HuggingFace (HF) Hub `_. + +To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. .. tip:: @@ -17,38 +27,57 @@ If the :code:`"architectures"` field contains a model architecture listed below, from vllm import LLM - llm = LLM(model=...) # Name or path of your model + # For generative models (task=generate) only + llm = LLM(model=..., task="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - If vLLM successfully generates text, it indicates that your model is supported. + # For pooling models (task={embed,classify,reward}) only + llm = LLM(model=..., task="embed") # Name or path of your model + output = llm.encode("Hello, my name is") + print(output) + + If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement your model in vLLM. Alternatively, you can `open an issue on GitHub `_ to request vLLM support. -.. note:: - To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: +ModelScope +++++++++++ - .. code-block:: shell +To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - $ export VLLM_USE_MODELSCOPE=True +.. code-block:: shell - And use with :code:`trust_remote_code=True`. + $ export VLLM_USE_MODELSCOPE=True - .. code-block:: python +And use with :code:`trust_remote_code=True`. - from vllm import LLM +.. 
code-block:: python - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) + from vllm import LLM + + llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) -Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^ + # For generative models (task=generate) only + output = llm.generate("Hello, my name is") + print(output) -Text Generation ---------------- + # For pooling models (task={embed,classify,reward}) only + output = llm.encode("Hello, my name is") + print(output) + +List of Text-only Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -328,8 +357,24 @@ Text Generation .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -Text Embedding --------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 50 5 5 @@ -371,13 +416,6 @@ Text Embedding - - -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. - .. note:: :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. @@ -389,8 +427,8 @@ Text Embedding On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. -Reward Modeling ---------------- +Reward Modeling (``--task reward``) +----------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -416,11 +454,8 @@ Reward Modeling For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Classification ---------------- +Classification (``--task classify``) +------------------------------------ .. list-table:: :widths: 25 25 50 5 5 @@ -437,11 +472,8 @@ Classification - ✅︎ - ✅︎ -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Sentence Pair Scoring ---------------------- +Sentence Pair Scoring (``--task score``) +---------------------------------------- .. 
list-table:: :widths: 25 25 50 5 5 @@ -468,13 +500,10 @@ Sentence Pair Scoring - - -.. note:: - These models are supported in both offline and online inference via Score API. - .. _supported_mm_models: -Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^ +List of Multimodal Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The following modalities are supported depending on the model: @@ -491,8 +520,15 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -Text Generation ---------------- +See :ref:`this page ` on how to pass multi-modal inputs to the model. + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 15 20 5 5 5 @@ -696,8 +732,24 @@ Text Generation The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -Multimodal Embedding --------------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 15 25 5 5 @@ -728,12 +780,7 @@ Multimodal Embedding - - ✅︎ -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. 
+---- Model Support Policy ===================== diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst index a93632ff36fb8..04dd72b1e3527 100644 --- a/docs/source/usage/compatibility_matrix.rst +++ b/docs/source/usage/compatibility_matrix.rst @@ -39,13 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph - - :abbr:`emd (Embedding Models)` + - :abbr:`pooling (Pooling Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`mm (Multimodal)` + - :abbr:`mm (Multimodal Inputs)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -151,7 +151,7 @@ Feature x Feature - - - - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✗ - ✗ - ✗ @@ -253,7 +253,7 @@ Feature x Feature - - - - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - `✗ `__ - `✗ `__ @@ -386,7 +386,7 @@ Feature x Hardware - ✅ - ✗ - ✅ - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✅ - ✅ - ✅ @@ -402,7 +402,7 @@ Feature x Hardware - ✅ - ✅ - ✗ - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - ✅ - ✅ diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index ae158eef2ca4c..17f6d992073d7 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -9,7 +9,12 @@ ] # Create an LLM. -model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) +model = LLM( + model="intfloat/e5-mistral-7b-instruct", + task="embed", # You should pass task="embed" for embedding models + enforce_eager=True, +) + # Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. 
diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index e1732d045f949..bf466109f0981 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -59,7 +59,7 @@ def run_e5_v(query: Query): llm = LLM( model="royokong/e5-v", - task="embedding", + task="embed", max_model_len=4096, ) @@ -88,7 +88,7 @@ def run_vlm2vec(query: Query): llm = LLM( model="TIGER-Lab/VLM2Vec-Full", - task="embedding", + task="embed", trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, ) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 99781c55b672e..87d5aefea6cb4 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -55,7 +55,7 @@ class TestSetting: # embedding model TestSetting( model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="FLASHINFER", @@ -65,7 +65,7 @@ class TestSetting: # encoder-based embedding model (BERT) TestSetting( model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="XFORMERS", diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 7cd0416d321ef..16bea54936bc8 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -37,7 +37,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - task="generate", + "generate", max_num_batched_tokens=64, max_num_seqs=num_seq_group, max_model_len=max_model_len, diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 425f2a10ec855..43c63daacb17f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -27,7 +27,7 @@ def server(): args = [ "--task", - "embedding", + "embed", "--dtype", "bfloat16", "--max-model-len", diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 5ef8540265d14..f458ef5ef556d 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -54,7 +54,7 @@ def test_models( hf_outputs = hf_model.encode(example_prompts) with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=None, **vllm_extra_kwargs) as vllm_model: diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index 30fa5ea7b36c0..0c3115d195fc1 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -35,9 +35,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -58,9 +56,7 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: 
hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -82,9 +78,7 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 3dd8cb729f8a6..2641987b25a3a 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -93,7 +93,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 693abd7252d5e..f4cd8b81a0d7d 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -47,7 +47,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=4096, enforce_eager=True) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 6145aff1a5ea2..9374c23dd6ffe 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -39,7 +39,7 @@ def _run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, task="embedding", dtype=dtype, + with vllm_runner(model, task="embed", dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) diff --git a/tests/test_config.py b/tests/test_config.py index 45b0b938af215..4518adfc31bfc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,11 +7,17 @@ from vllm.platforms import current_platform -@pytest.mark.parametrize(("model_id", "expected_task"), [ - ("facebook/opt-125m", "generate"), - ("intfloat/e5-mistral-7b-instruct", "embedding"), -]) -def test_auto_task(model_id, expected_task): +@pytest.mark.parametrize( + ("model_id", "expected_runner_type", "expected_task"), + [ + ("facebook/opt-125m", "generate", "generate"), + ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), + ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), + ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), + ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), + ], +) +def test_auto_task(model_id, expected_runner_type, expected_task): config = ModelConfig( model_id, task="auto", @@ -22,6 +28,7 @@ def test_auto_task(model_id, expected_task): dtype="float16", ) + assert config.runner_type == expected_runner_type assert config.task == expected_task diff --git a/vllm/config.py b/vllm/config.py index 2a9f0ebae997d..2d9a76fe7ddb1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,13 +45,27 @@ logger = init_logger(__name__) -_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 -TaskOption = Literal["auto", "generate", "embedding"] +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward"] -# "draft" is only used internally for speculative decoding -_Task = Literal["generate", "embedding", "draft"] +_ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", + "draft"] + +RunnerType = Literal["generate", "pooling", "draft"] + +_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { + "generate": ["generate"], + "pooling": ["embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { + task: runner + for runner, tasks in _RUNNER_TASKS.items() for task in tasks +} HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] @@ -144,7 +158,7 @@ class ModelConfig: def __init__( self, model: str, - task: Union[TaskOption, _Task], + task: Union[TaskOption, Literal["draft"]], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -295,6 +309,7 @@ def __init__( supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task + self.pooler_config = self._init_pooler_config(override_pooler_config) self._verify_quantization() @@ -323,7 +338,7 @@ def _init_pooler_config( override_pooler_config: Optional["PoolerConfig"], ) -> Optional["PoolerConfig"]: - if self.task == "embedding": + if self.runner_type == "pooling": user_config = override_pooler_config or PoolerConfig() base_config = get_pooling_config(self.model, self.revision) @@ -357,60 +372,90 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto', 'slow' or 'mistral'.") self.tokenizer_mode = tokenizer_mode + def _get_preferred_task( + self, + architectures: List[str], + supported_tasks: Set[_ResolvedTask], + ) -> Optional[_ResolvedTask]: + model_id = self.model + if get_pooling_config(model_id, self.revision): + return "embed" + 
if ModelRegistry.is_cross_encoder_model(architectures): + return "score" + + suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ForSequenceClassification", "classify"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embed"), + ("RewardModel", "reward"), + ] + _, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + return pref_task + + return None + def _resolve_task( self, - task_option: Union[TaskOption, _Task], + task_option: Union[TaskOption, Literal["draft"]], hf_config: PretrainedConfig, - ) -> Tuple[Set[_Task], _Task]: + ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" architectures = getattr(hf_config, "architectures", []) - task_support: Dict[_Task, bool] = { + runner_support: Dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "generate": ModelRegistry.is_text_generation_model(architectures), - "embedding": ModelRegistry.is_pooling_model(architectures), + "pooling": ModelRegistry.is_pooling_model(architectures), } - supported_tasks_lst: List[_Task] = [ - task for task, is_supported in task_support.items() if is_supported + supported_runner_types_lst: List[RunnerType] = [ + runner_type + for runner_type, is_supported in runner_support.items() + if is_supported + ] + + supported_tasks_lst: List[_ResolvedTask] = [ + task for runner_type in supported_runner_types_lst + for task in _RUNNER_TASKS[runner_type] ] supported_tasks = set(supported_tasks_lst) if task_option == "auto": selected_task = next(iter(supported_tasks_lst)) - if len(supported_tasks) > 1: - suffix_to_preferred_task: List[Tuple[str, _Task]] = [ - # Hardcode the models that are exceptions - ("AquilaModel", "generate"), - ("ChatGLMModel", "generate"), - # Other models follow this pattern - ("ForCausalLM", "generate"), - ("ForConditionalGeneration", "generate"), - ("ChatModel", "generate"), - ("LMHeadModel", "generate"), - ("EmbeddingModel", "embedding"), - ("RewardModel", "embedding"), - ("ForSequenceClassification", "embedding"), - ] - info, arch = ModelRegistry.inspect_model_cls(architectures) - - for suffix, pref_task in suffix_to_preferred_task: - if arch.endswith(suffix) and pref_task in supported_tasks: - selected_task = pref_task - break - else: - if (arch.endswith("Model") - and info.architecture.endswith("ForCausalLM") - and "embedding" in supported_tasks): - selected_task = "embedding" + if len(supported_tasks_lst) > 1: + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task is not None: + selected_task = preferred_task logger.info( "This model supports multiple tasks: %s. " "Defaulting to '%s'.", supported_tasks, selected_task) else: + # Aliases + if task_option == "embedding": + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task != "embed": + msg = ("The 'embedding' task will be restricted to " + "embedding models in a future release. 
Please " + "pass `--task classify`, `--task score`, or " + "`--task reward` explicitly for other pooling " + "models.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + task_option = preferred_task or "embed" + if task_option not in supported_tasks: msg = ( f"This model does not support the '{task_option}' task. " @@ -533,7 +578,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Async postprocessor is not necessary with embedding mode # since there is no token generation - if self.task == "embedding": + if self.runner_type == "pooling": self.use_async_output_proc = False # Reminder: Please update docs/source/usage/compatibility_matrix.rst @@ -750,6 +795,14 @@ def is_cross_encoder(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_cross_encoder_model(architectures) + @property + def supported_runner_types(self) -> Set[RunnerType]: + return {_TASK_RUNNER[task] for task in self.supported_tasks} + + @property + def runner_type(self) -> RunnerType: + return _TASK_RUNNER[self.task] + class CacheConfig: """Configuration for the KV cache. @@ -1096,7 +1149,7 @@ def _verify_args(self) -> None: class SchedulerConfig: """Scheduler configuration.""" - task: str = "generate" # The task to use the model for. + runner_type: str = "generate" # The runner type to launch for the model. # Maximum number of tokens to be processed in a single iteration. max_num_batched_tokens: int = field(default=None) # type: ignore @@ -1164,11 +1217,11 @@ def __post_init__(self) -> None: # for higher throughput. self.max_num_batched_tokens = max(self.max_model_len, 2048) - if self.task == "embedding": - # For embedding, choose specific value for higher throughput + if self.runner_type == "pooling": + # Choose specific value for higher throughput self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS, + _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, ) if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 94c62743883ec..c3bc6becf0995 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -337,7 +337,7 @@ def __init__( self.lora_config = lora_config version = "selfattn" - if (self.scheduler_config.task == "embedding" + if (self.scheduler_config.runner_type == "pooling" or self.cache_config.is_attention_free): version = "placeholder" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7b9adc401abcf..d485c2a9e7208 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1066,7 +1066,7 @@ def create_engine_config(self, if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and model_config.task != "embedding"): + and model_config.runner_type != "pooling"): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1083,7 +1083,8 @@ def create_engine_config(self, "errors during the initial memory profiling phase, or result " "in low performance due to small KV cache space. 
Consider " "setting --max-model-len to a smaller value.", max_model_len) - elif self.enable_chunked_prefill and model_config.task == "embedding": + elif (self.enable_chunked_prefill + and model_config.runner_type == "pooling"): msg = "Chunked prefill is not supported for embedding models" raise ValueError(msg) @@ -1144,7 +1145,7 @@ def create_engine_config(self, " please file an issue with detailed information.") scheduler_config = SchedulerConfig( - task=model_config.task, + runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6eca304b45f07..9be30c635cb2c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -288,7 +288,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.model_executor = executor_class(vllm_config=vllm_config, ) - if self.model_config.task != "embedding": + if self.model_config.runner_type != "pooling": self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. @@ -1123,7 +1123,7 @@ def _process_model_outputs(self, seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": self._process_sequence_group_outputs(seq_group, output) else: self.output_processor.process_prompt_logprob(seq_group, output) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2a02187223a33..0bec978c4869c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -381,19 +381,20 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "generate": + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "generate": messages = [ "LLM.generate() is only supported for (conditional) generation " "models (XForCausalLM, XForConditionalGeneration).", ] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "generate" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "generate" in supported_runner_types: messages.append( - "Your model supports the 'generate' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task generate`.") + "Your model supports the 'generate' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task generate`.") raise ValueError(" ".join(messages)) @@ -793,16 +794,18 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.encode() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.encode() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. 
Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) @@ -864,21 +867,23 @@ def score( A list of ``PoolingRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.score() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.score() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support the cross encoding") + raise ValueError("Your model does not support cross encoding") tokenizer = self.llm_engine.get_tokenizer() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0f93eb54111ad..a345f8caeeed2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -573,7 +573,7 @@ def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -582,7 +582,7 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, @@ -590,13 +590,13 @@ def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, base_model_paths, request_logger=request_logger - ) if (model_config.task == "embedding" \ + ) if (model_config.runner_type == "pooling" \ and model_config.is_cross_encoder) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 00cdb3b6839f5..675daf54c0d0d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -224,7 +224,7 @@ async def main(args): chat_template=None, chat_template_content_format="auto", 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, @@ -232,7 +232,7 @@ async def main(args): request_logger=request_logger, chat_template=None, chat_template_content_format="auto", - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index cfb89e0f336bc..f15e7176b3d50 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -35,7 +35,7 @@ def get_model_architecture( architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) - if model_config.task == "embedding": + if model_config.runner_type == "pooling": model_cls = as_embedding_model(model_cls) return model_cls, arch diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index fdb241e6753fb..55a5c4dff3a5c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -42,7 +42,7 @@ def __init__( executor_class: Type[Executor], usage_context: UsageContext, ): - assert vllm_config.model_config.task != "embedding" + assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 4fad1a3f4caeb..ba3d4a130a80b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -163,7 +163,7 @@ def __init__( not in ["medusa", "mlp_speculator", "eagle"]) \ else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 094dd5a5d08b3..832b9903b7abc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -75,7 +75,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_config.task == "embedding": + if model_config.runner_type == "pooling": ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner From cad5c0a6eda057eeece87a42fff49fef3e18a2ac Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 21:36:27 +0800 Subject: [PATCH 019/357] [Doc] Update docs to refer to pooling models (#11093) Signed-off-by: DarkLight1337 --- docs/source/usage/faq.rst | 7 ++++++- vllm/attention/backends/placeholder_attn.py | 2 +- vllm/config.py | 8 ++++---- vllm/core/placeholder_block_space_manager.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 2 +- vllm/engine/protocol.py | 2 +- vllm/entrypoints/openai/serving_score.py | 2 +- vllm/sequence.py | 6 +++--- vllm/v1/engine/processor.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/hpu_worker.py | 4 ++-- vllm/worker/worker.py | 2 +- 14 files changed, 26 insertions(+), 21 deletions(-) diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.rst index ce327abd5fa20..d88da32092924 
100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.rst @@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul Q: Which model to use for offline inference embedding? -A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model +A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; +more are listed :ref:`here `. + +By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, +`Mistral-7B-Instruct-v0.3 `__ into embedding models, +but they are expected to be inferior to models that are specifically trained on embedding tasks. ---------------------------------------- diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 658039bfc3365..534f79b3a60bf 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -14,7 +14,7 @@ from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -# Placeholder attention backend for models like Mamba and embedding models that +# Placeholder attention backend for models like Mamba and pooling models that # lack attention. diff --git a/vllm/config.py b/vllm/config.py index 2d9a76fe7ddb1..322c8f8990a40 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -152,7 +152,7 @@ class ModelConfig: this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. override_pooler_config: Initialize non default pooling config or - override default pooling config for the embedding model. + override default pooling config for the pooling model. """ def __init__( @@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Async postprocessor is not necessary with embedding mode + # Async postprocessor is not necessary for pooling models # since there is no token generation if self.runner_type == "pooling": self.use_async_output_proc = False @@ -1825,11 +1825,11 @@ class MultiModalConfig: @dataclass class PoolerConfig: - """Controls the behavior of output pooling in embedding models.""" + """Controls the behavior of output pooling in pooling models.""" pooling_type: Optional[str] = None """ - The pooling method of the embedding model. This should be a key in + The pooling method of the pooling model. This should be a key in :class:`vllm.model_executor.layers.pooler.PoolingType`. """ diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 26d42b7f1790e..a47e594518534 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -8,7 +8,7 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager): """A version of BlockSpaceManager for use in environments where block management is not required. - For example: embedding models or attention-free models like Mamba. + For example: pooling models or attention-free models like Mamba.
This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d485c2a9e7208..7337522bc9952 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--override-pooler-config', type=PoolerConfig.from_json, default=None, - help="Override or set the pooling method in the embedding model. " + help="Override or set the pooling method for pooling models. " "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") parser.add_argument('--compilation-config', @@ -1085,7 +1085,7 @@ def create_engine_config(self, "setting --max-model-len to a smaller value.", max_model_len) elif (self.enable_chunked_prefill and model_config.runner_type == "pooling"): - msg = "Chunked prefill is not supported for embedding models" + msg = "Chunked prefill is not supported for pooling models" raise ValueError(msg) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 60dccd7a0812c..32396fd10188d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1085,7 +1085,7 @@ async def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a729023bc00bb..0a046c71e86e8 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -527,7 +527,7 @@ def encode( *, inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 4079de7d36793..a066836b92708 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -209,7 +209,7 @@ def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model.""" + """Generate outputs for a request from a pooling model.""" ... @abstractmethod diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index fed06fa452955..4929e720c00e4 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -119,7 +119,7 @@ async def create_score( if prompt_adapter_request is not None: raise NotImplementedError("Prompt adapter is not supported " - "for embedding models") + "for scoring models") if isinstance(tokenizer, MistralTokenizer): raise ValueError( diff --git a/vllm/sequence.py b/vllm/sequence.py index 669124319c4f4..b0f3c1cc3609f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -618,9 +618,9 @@ class SequenceGroup: arrival_time: The arrival time of the request. lora_request: LoRA request. 
embeddings: The embeddings vectors of the prompt of the sequence group - for an embedding model. + for a pooling model. pooling_params: The pooling parameters used to generate the pooling - for an embedding model. + for a pooling model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. trace_headers: OpenTelemetry trace headers. @@ -1102,7 +1102,7 @@ class PoolerOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """The output from a pooling operation in the embedding model.""" + """The output from a pooling operation in the pooling model.""" outputs: List[EmbeddingSequenceGroupOutput] # lazy import to avoid circular import diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 120fc64969552..e0e525b30a767 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -59,7 +59,7 @@ def process_inputs( priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ba3d4a130a80b..09758a5d9accf 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -178,7 +178,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - # Initialize cpu_cache as embedding models don't initialize kv_caches + # Initialize cpu_cache as pooling models don't initialize kv_caches self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 493f7a9fad098..cca7cd50bfc7b 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -65,8 +65,8 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Initialize gpu_cache as pooling models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 832b9903b7abc..a368bb9ee9a5b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -91,7 +91,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches + # Initialize gpu_cache as pooling models don't initialize kv_caches self.gpu_cache: Optional[List[List[torch.Tensor]]] = None self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} From b2f775456e4af7412308320a9c11e4dac3086205 Mon Sep 17 00:00:00 2001 From: hissu-hyvarinen Date: Wed, 11 Dec 2024 17:23:37 +0200 Subject: [PATCH 020/357] [CI/Build] Enable prefix caching test for AMD (#11098) Signed-off-by: Hissu Hyvarinen --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8f57006214c88..df4fa7a6ee9ba 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min - #mirror_hardwares: [amd] + mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/prefix_caching From fd22220687af5ccd89d9f8f2812069ef0422244c Mon Sep 17 00:00:00 2001 From: bingps <46775742+bingps@users.noreply.github.com> Date: Wed, 11 Dec 2024 23:43:24 +0800 Subject: [PATCH 021/357] [Doc] Installed version of llmcompressor for int8/fp8 quantization (#11103) Signed-off-by: Guangda Liu Co-authored-by: Guangda Liu --- docs/source/quantization/fp8.rst | 2 +- docs/source/quantization/int8.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index aacd07a34ad46..4dbf8e9d346e1 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -45,7 +45,7 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst index 04fa308449507..aa5b251becb1c 100644 --- a/docs/source/quantization/int8.rst +++ b/docs/source/quantization/int8.rst @@ -19,7 +19,7 @@ To use INT8 quantization with vLLM, you'll need to install the `llm-compressor < .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- @@ -142,4 +142,4 @@ Best Practices Troubleshooting and Support --------------------------- -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. \ No newline at end of file +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. 
From 91642db952458fbb6ae7c2d167757dc86b105991 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:43:05 -0800 Subject: [PATCH 022/357] [torch.compile] use depyf to dump torch.compile internals (#10972) Signed-off-by: youkaichao --- requirements-common.txt | 1 + vllm/compilation/backends.py | 69 ++++++++++++++++++---------------- vllm/compilation/decorators.py | 2 +- vllm/compilation/monitor.py | 23 ++++++++++-- vllm/compilation/wrapper.py | 4 +- vllm/config.py | 6 ++- vllm/worker/model_runner.py | 3 +- 7 files changed, 66 insertions(+), 42 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 792cd58e80669..850b8f4101701 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -33,3 +33,4 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.8.0 # required for compressed-tensors +depyf==0.18.0 # required for profiling and debugging torch.compile diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f002a8ff905b1..09a3daa731829 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -9,7 +9,7 @@ import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -149,14 +149,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: List[str], - compilation_configs: CompilationConfig, graph_pool): + compile_submod_names: List[str], vllm_config: VllmConfig, + graph_pool): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() self.compile_submod_names = compile_submod_names - self.compilation_configs = compilation_configs + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool + self.vllm_config = vllm_config def run(self, *args): fake_args = [ @@ -182,15 +183,15 @@ def call_module(self, target: torch.fx.node.Target, compiled_graph_for_general_shape = wrap_inductor( submod, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, index, + submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) @@ -211,7 +212,8 @@ class VllmBackend: which handles the post-grad passes. 
""" - compilation_configs: CompilationConfig + vllm_config: VllmConfig + compilation_config: CompilationConfig graph_pool: Any _called: bool = False # the graph we compiled @@ -227,7 +229,7 @@ class VllmBackend: def __init__( self, - compilation_configs: CompilationConfig, + vllm_config: VllmConfig, ): global global_graph_pool if global_graph_pool is None: @@ -244,13 +246,14 @@ def __init__( self.sym_tensor_indices = [] self.input_buffers = [] - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_configs + config = self.compilation_config self.post_grad_pass_manager.configure(config.pass_config) # Post-grad custom passes are run using the post_grad_custom_post_pass @@ -271,7 +274,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: from .monitor import torch_compile_start_time dynamo_time = time.time() - torch_compile_start_time logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) - self.compilation_configs.compilation_time += dynamo_time + self.compilation_config.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once @@ -281,7 +284,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( - graph, self.compilation_configs.splitting_ops) + graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code logger.debug("%s", lazy_format_graph_code("before split", self.graph)) @@ -298,13 +301,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.compilation_configs, + self.vllm_config, self.graph_pool).run(*example_inputs) self._called = True - if not self.compilation_configs.use_cudagraph or \ - not self.compilation_configs.cudagraph_copy_inputs: + if not self.compilation_config.use_cudagraph or \ + not self.compilation_config.cudagraph_copy_inputs: return self.split_gm # if we need to copy input buffers for cudagraph @@ -364,10 +367,9 @@ class ConcreteSizeEntry: class PiecewiseBackend: - def __init__(self, graph: fx.GraphModule, - compilation_configs: CompilationConfig, graph_pool: Any, - piecewise_compile_index: int, total_piecewise_compiles: int, - sym_shape_indices: List[int], + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. @@ -375,7 +377,7 @@ def __init__(self, graph: fx.GraphModule, We will compile `self.graph` once for the general shape, and then compile for different shapes specified in - `compilation_configs.compile_sizes`. + `compilation_config.compile_sizes`. Independently, we will capture cudagraph for different shapes. @@ -383,7 +385,8 @@ def __init__(self, graph: fx.GraphModule, compile it first, and then capture cudagraph. 
""" self.graph = graph - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles @@ -393,10 +396,10 @@ def __init__(self, graph: fx.GraphModule, piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( - self.compilation_configs.compile_sizes) + self.compilation_config.compile_sizes) self.capture_sizes: Set[int] = set( - self.compilation_configs.capture_sizes - ) if self.compilation_configs.use_cudagraph else set() + self.compilation_config.capture_sizes + ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -423,7 +426,7 @@ def __call__(self, *args) -> Any: self.first_run_finished = True # no specific sizes to compile if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -443,28 +446,28 @@ def __call__(self, *args) -> Any: entry.runnable = wrap_inductor( self.graph, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) if not entry.use_cudagraph: return entry.runnable(*args) if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa entry.num_finished_warmup += 1 if self.is_first_graph: logger.debug( "Warming up %s/%s for shape %s", entry.num_finished_warmup, - self.compilation_configs.cudagraph_num_of_warmups, + self.compilation_config.cudagraph_num_of_warmups, runtime_shape) return entry.runnable(*args) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 938430fe2a501..805a217ee6ca1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -185,7 +185,7 @@ def __call__(self, *args, **kwargs): "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config.compilation_config) + start_monitoring_torch_compile(self.vllm_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 3348674b09af2..b97e40415b41b 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,19 +1,36 @@ +import os import time -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) +context_manager = None torch_compile_start_time: float = 
0.0 -def start_monitoring_torch_compile(compilation_config: CompilationConfig): +def start_monitoring_torch_compile(vllm_config: VllmConfig): global torch_compile_start_time torch_compile_start_time = time.time() + compilation_config: CompilationConfig = vllm_config.compilation_config + if compilation_config.level == CompilationLevel.PIECEWISE and \ + compilation_config.debug_dump_path: + import depyf + path = os.path.join(compilation_config.debug_dump_path, + f"rank_{vllm_config.parallel_config.rank}") + global context_manager + context_manager = depyf.prepare_debug(path) + context_manager.__enter__() -def end_monitoring_torch_compile(compilation_config: CompilationConfig): + +def end_monitoring_torch_compile(vllm_config: VllmConfig): + compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.level == CompilationLevel.PIECEWISE: logger.info("torch.compile takes %.2f s in total", compilation_config.compilation_time) + global context_manager + if context_manager is not None: + context_manager.__exit__(None, None, None) + context_manager = None diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index bc4d292fef402..c10241b483169 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,8 +32,8 @@ def __init__(self, # default compilation settings # compiling the forward method - backend = get_current_vllm_config( - ).compilation_config.init_backend() + vllm_config = get_current_vllm_config() + backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 322c8f8990a40..7f9be5a3a98bc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2222,6 +2222,7 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - debug_dump_path: the path to dump the debug information. - backend: the backend for compilation. It needs to be a string. - "" (empty string): use the default backend. - "eager"/"openxla"/...: use the specified backend registered in PyTorch. @@ -2289,6 +2290,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. 
""" # noqa level: int = 0 + debug_dump_path: str = "" backend: str = "" custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ @@ -2394,7 +2396,7 @@ def model_post_init(self, __context: Any) -> None: self.static_forward_context = {} self.compilation_time = 0.0 - def init_backend(self) -> Union[str, Callable]: + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -2413,7 +2415,7 @@ def init_backend(self) -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE from vllm.compilation.backends import VllmBackend - return VllmBackend(self) + return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 551b84435fdc0..26fd486130ce6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1162,7 +1162,8 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend() + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From d643c2aba1cd5421200f3a3bad1813dd067233b4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 11 Dec 2024 10:49:23 -0800 Subject: [PATCH 023/357] [V1] Use input_ids as input for text-only models (#11032) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8d9976ded7c5e..e75be21ef2d91 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,6 +61,7 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] + self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -103,6 +104,11 @@ def __init__( # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + + # Persistent buffers for CUDA graphs. + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) query_start_loc = query_start_loc.to(self.device, non_blocking=True) @@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. 
logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -427,13 +434,15 @@ def execute_model( ) -> ModelRunnerOutput: self._update_states(scheduler_output) - # Run the encoder. - self._execute_encoder(scheduler_output) - encoder_outputs = self._gather_encoder_outputs(scheduler_output) + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + else: + encoder_outputs = [] # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -444,29 +453,39 @@ def execute_model( else: # Eager mode. num_input_tokens = num_scheduled_tokens - attn_metadata.num_input_tokens = num_input_tokens - # Get the inputs embeds. - if encoder_outputs: - inputs_embeds = self.model.get_input_embeddings( - input_ids, encoder_outputs) + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_scheduled_tokens] + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None else: - inputs_embeds = self.model.get_input_embeddings(input_ids) - # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), - # always use embeddings (rather than token ids) as input to the model. - # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): hidden_states = self.model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_input_tokens], + inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -534,13 +553,20 @@ def _dummy_run( num_tokens: int, kv_caches: List[torch.Tensor], ) -> torch.Tensor: + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None with set_forward_context(None, self.vllm_config): hidden_states = model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_tokens], kv_caches=kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_tokens]) + inputs_embeds=inputs_embeds, + ) return hidden_states def profile_run(self) -> None: From 66aaa7722df3d7ef9e9bd2942cab5cd0d7473174 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:59:50 -0800 Subject: [PATCH 024/357] [torch.compile] remove graph logging in ci (#11110) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 09a3daa731829..4a5dc337d01b8 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -287,9 +287,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", lazy_format_graph_code("before split", self.graph)) - logger.debug("%s", lazy_format_graph_code("after split", - self.split_gm)) + + # depyf will hook lazy_format_graph_code and dump the graph + # for debugging, no need to print the graph here + lazy_format_graph_code("before split", self.graph) + lazy_format_graph_code("after split", self.split_gm) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) From 72ff3a968682e6a3f7620ab59f2baf5e8eb2777b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:36:35 -0800 Subject: [PATCH 025/357] [core] Bump ray to use _overlap_gpu_communication in compiled graph tests (#10410) Signed-off-by: Rui Qiao Signed-off-by: Rui Qiao Co-authored-by: Rui Qiao --- requirements-test.in | 2 +- requirements-test.txt | 2 +- vllm/envs.py | 8 ++++++++ vllm/executor/ray_gpu_executor.py | 17 ++++++++++------- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index c0b228148ab31..57fddb416317e 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,7 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft -ray[adag]==2.35 +ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test diff --git a/requirements-test.txt b/requirements-test.txt index 8ceb705cdffd7..c786a1249bddb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -410,7 +410,7 @@ pyyaml==6.0.2 # ray # timm # transformers -ray[adag]==2.35.0 +ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 # via tensorizer diff --git a/vllm/envs.py b/vllm/envs.py index be5d9985b63a4..bc8c1499e9534 100644 --- a/vllm/envs.py 
+++ b/vllm/envs.py @@ -45,6 +45,7 @@ VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -337,6 +338,13 @@ def get_default_config_root(): lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) ), + # If the env var is set, it enables GPU communication overlap in + # Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1")) + ), + # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 4263fb27265f6..4bf5cbbd18ffe 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -414,12 +414,10 @@ def _check_ray_adag_installation(self): import pkg_resources from packaging import version - required_version = version.parse("2.35") + required_version = version.parse("2.40") current_version = version.parse( pkg_resources.get_distribution("ray").version) - # TODO: update the constraint once we adapt to the backward - # incompatible API change from ray 2.36 - if current_version != required_version: + if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") @@ -445,6 +443,8 @@ def _compiled_ray_dag(self, enable_asyncio: bool): logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) + logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -480,7 +480,10 @@ def _compiled_ray_dag(self, enable_asyncio: bool): forward_dag = MultiOutputNode(outputs) - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + return forward_dag.experimental_compile( + enable_asyncio=enable_asyncio, + _overlap_gpu_communication=envs. 
+ VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) def __del__(self): self.shutdown() @@ -507,8 +510,8 @@ async def execute_model_async( serialized_data = self.input_encoder.encode(execute_model_req) dag_future = await self.forward_dag.execute_async(serialized_data) - outputs = await dag_future - return self.output_decoder.decode(outputs[0]) + output = await dag_future[0] + return self.output_decoder.decode(output) async def _driver_execute_model_async( self, From d1e21a979bba4712f48dac1bbf410e0b57c92e7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Dec 2024 06:18:16 +0800 Subject: [PATCH 026/357] [CI/Build] Split up VLM tests (#11083) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 32 ++++++--- pyproject.toml | 3 +- .../vision_language/test_models.py | 72 ++++++++++++------- tests/utils.py | 37 ++++++---- 4 files changed, 94 insertions(+), 50 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df4fa7a6ee9ba..aca505178df06 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 30min +- label: Basic Models Test # 24min source_file_dependencies: - vllm/ - tests/models @@ -331,7 +331,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) # 42min +- label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -342,7 +342,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model -- label: Language Models Test (Extended) # 50min +- label: Language Models Test (Extended) # 1h10min optional: true source_file_dependencies: - vllm/ @@ -353,7 +353,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 26min +- label: Multi-Modal Models Test (Standard) # 28min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -369,7 +369,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h15m +- label: Multi-Modal Models Test (Extended) 1 # 1h16m optional: true source_file_dependencies: - vllm/ @@ -380,14 +380,24 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not 
core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test optional: true @@ -446,11 +456,11 @@ steps: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py @@ -540,7 +550,7 @@ steps: # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/pyproject.toml b/pyproject.toml index 253b706a774a7..c5a14ecf5aea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,8 @@ markers = [ "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", "quant_model: run this model test under Quantized category", - "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index ed8f34a677f84..3101d1d2ea831 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,7 +1,9 @@ """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" +import math import os +from collections import defaultdict from pathlib import PosixPath from typing import Type @@ -10,11 +12,12 @@ from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform -from vllm.utils import cuda_device_count_stateless, identity +from vllm.utils import identity from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, _VideoAssets) -from ....utils import fork_new_process_for_each_test, large_gpu_mark +from ....utils import (fork_new_process_for_each_test, large_gpu_mark, + multi_gpu_marks) from ...utils import check_outputs_equal from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils.case_filtering import get_parametrized_options @@ -382,7 +385,7 @@ prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), ### Tensor parallel / multi-gpu broadcast tests - "broadcast-chameleon": VLMTestInfo( + "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, @@ -393,43 +396,25 @@ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ), - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava": VLMTestInfo( + "llava-broadcast": VLMTestInfo( models=["llava-hf/llava-1.5-7b-hf"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava_next": VLMTestInfo( + "llava_next-broadcast": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", max_model_len=10240, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), ### Custom input edge-cases for specific models @@ -468,6 +453,41 @@ # yapf: enable +def _mark_splits( + test_settings: dict[str, VLMTestInfo], + *, + num_groups: int, +) -> dict[str, VLMTestInfo]: + name_by_test_info_id = {id(v): k for k, v in test_settings.items()} + test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list) + + for info in test_settings.values(): + for model in info.models: + test_infos_by_model[model].append(info) + + models = sorted(test_infos_by_model.keys()) + split_size = math.ceil(len(models) / num_groups) + + new_test_settings = dict[str, VLMTestInfo]() + + for i in range(num_groups): + models_in_group = models[i * split_size:(i + 1) * split_size] + + for model in models_in_group: + for info in test_infos_by_model[model]: + new_marks = (info.marks or []) + [pytest.mark.split(group=i)] + new_info = info._replace(marks=new_marks) + new_test_settings[name_by_test_info_id[id(info)]] = new_info + + missing_keys 
= test_settings.keys() - new_test_settings.keys() + assert not missing_keys, f"Missing keys: {missing_keys}" + + return new_test_settings + + +VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) + + ### Test wrappers # Wrappers around the core test running func for: # - single image diff --git a/tests/utils.py b/tests/utils.py index a893667e144a6..afeb708f3bcdc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -682,10 +682,12 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: - """Gets a pytest skipif mark, which triggers ig the the device doesn't have - meet a minimum memory requirement in gb; can be leveraged via - @large_gpu_test to skip tests in environments without enough resources, or - called when filtering tests to run directly. + """ + Get a pytest mark, which skips the test if the GPU doesn't meet + a minimum memory requirement in GB. + + This can be leveraged via `@large_gpu_test` to skip tests in environments + without enough resources, or called when filtering tests to run directly. """ try: if current_platform.is_cpu(): @@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int): Currently, the CI machine uses L4 GPU which has 24 GB VRAM. """ - test_skipif = large_gpu_mark(min_gb) + mark = large_gpu_mark(min_gb) def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_skipif(f) + return mark(f) return wrapper -def multi_gpu_test(*, num_gpus: int): - """ - Decorate a test to be run only when multiple GPUs are available. - """ - test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus") +def multi_gpu_marks(*, num_gpus: int): + """Get a collection of pytest marks to apply for `@multi_gpu_test`.""" + test_selector = pytest.mark.distributed(num_gpus=num_gpus) test_skipif = pytest.mark.skipif( cuda_device_count_stateless() < num_gpus, reason=f"Need at least {num_gpus} GPUs to run the test.", ) + return [test_selector, test_skipif] + + +def multi_gpu_test(*, num_gpus: int): + """ + Decorate a test to be run only when multiple GPUs are available. + """ + marks = multi_gpu_marks(num_gpus=num_gpus) + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_selector(test_skipif(fork_new_process_for_each_test(f))) + func = fork_new_process_for_each_test(f) + for mark in reversed(marks): + func = mark(func) + + return func return wrapper From 452a723bf2e8410ee9b47f82f90c7ea48aa6d14f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 11 Dec 2024 18:34:54 -0500 Subject: [PATCH 027/357] [V1][Core] Remove should_shutdown to simplify core process termination (#11113) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 13 ++----------- vllm/v1/engine/core_client.py | 6 ------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 55a5c4dff3a5c..a26ffe74a3ae8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -5,7 +5,6 @@ import threading import time from multiprocessing.process import BaseProcess -from multiprocessing.sharedctypes import Synchronized from typing import List, Tuple, Type, Union import zmq @@ -133,13 +132,9 @@ def __init__( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ): super().__init__(vllm_config, executor_class, usage_context) - # Signal from main process to shutdown (multiprocessing.Value). - self.should_shutdown = should_shutdown - # Background Threads and Queues for IO. 
These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -195,7 +190,6 @@ def make_engine_core_process( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ) -> BaseProcess: # The current process might have CUDA context, # so we need to spawn a new process. @@ -210,7 +204,6 @@ def make_engine_core_process( "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, - "should_shutdown": should_shutdown } # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, @@ -260,8 +253,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until we get a shutdown signal. - while not self.should_shutdown: + # Loop until process is sent a SIGINT or SIGTERM + while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -272,8 +265,6 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - if self.should_shutdown: - return except BaseException: raise diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d96b323d1662..1d5ddf4db4d7c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ import atexit -import multiprocessing from typing import List, Union import msgspec @@ -149,21 +148,16 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. - self.should_shutdown = multiprocessing.Value('b', False, lock=False) self.proc = EngineCoreProc.make_engine_core_process( *args, input_path=input_path, output_path=output_path, ready_path=ready_path, - should_shutdown=self.should_shutdown, **kwargs, ) atexit.register(self.shutdown) def shutdown(self): - # Send shutdown signal to background process. - self.should_shutdown = True - # Shut down the zmq context. self.ctx.destroy(linger=0) From 4e116833686f3e0c0a223b05b5859ad76843a017 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:55:30 -0500 Subject: [PATCH 028/357] [V1] VLM preprocessor hashing (#11020) Signed-off-by: Roger Wang Signed-off-by: Alexander Matveev Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- examples/offline_inference_vision_language.py | 126 ++++++++++++-- requirements-common.txt | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + vllm/config.py | 10 +- vllm/engine/arg_utils.py | 8 + vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/core.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 156 ++++++++++++++++-- vllm/v1/engine/processor.py | 35 ++-- vllm/v1/utils.py | 21 +++ 11 files changed, 332 insertions(+), 48 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index c6a274ee5894b..5e210126dc8fe 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -5,6 +5,8 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" +import random + from transformers import AutoTokenizer from vllm import LLM, SamplingParams @@ -23,7 +25,9 @@ def run_llava(question: str, modality: str): prompt = f"USER: \n{question}\nASSISTANT:" - llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096) + llm = LLM(model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -33,7 +37,9 @@ def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +50,9 @@ def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: