diff --git a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py b/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
index ccc6b78b1..d311b2196 100644
--- a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
@@ -10,8 +10,9 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
+import ast
 from enum import Enum
-from typing import Optional, Mapping
+from typing import Optional, Mapping, Tuple
 
 from pydantic import model_validator, field_validator
 
@@ -60,7 +61,10 @@ class LmiDistRbProperties(Properties):
     enable_lora: Optional[bool] = False
     max_loras: Optional[int] = 4
     max_lora_rank: Optional[int] = 16
+    fully_sharded_loras: bool = False
     lora_extra_vocab_size: Optional[int] = 256
+    long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
+    lora_dtype: Optional[str] = 'auto'
     max_cpu_loras: Optional[int] = None
     max_logprobs: Optional[int] = 20
     enable_chunked_prefill: Optional[bool] = None
@@ -94,6 +98,23 @@ def validate_speculative_and_fml(self):
         )
         return self
 
+    @field_validator('long_lora_scaling_factors', mode='before')
+    def validate_long_lora_scaling_factors(cls, val):
+        if isinstance(val, str):
+            val = ast.literal_eval(val)
+        if not isinstance(val, tuple):
+            if isinstance(val, list):
+                val = tuple(float(v) for v in val)
+            elif isinstance(val, float):
+                val = (val, )
+            elif isinstance(val, int):
+                val = (float(val), )
+            else:
+                raise ValueError(
+                    "long_lora_scaling_factors must be convertible to a tuple of floats."
+                )
+        return val
+
     @field_validator('limit_mm_per_prompt', mode="before")
     def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
         out_dict: Dict[str, int] = {}
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 577a110b5..25f2874d9 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -10,6 +10,7 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
+import ast
 from enum import Enum
 from typing import Optional, Any, Mapping, Tuple
 
@@ -51,7 +52,7 @@ class VllmRbProperties(Properties):
     max_lora_rank: Optional[int] = 16
     fully_sharded_loras: bool = False
     lora_extra_vocab_size: int = 256
-    long_lora_scaling_factors: Optional[Tuple[float]] = None
+    long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     lora_dtype: Optional[str] = 'auto'
     max_cpu_loras: Optional[int] = None
 
@@ -91,6 +92,23 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine
 
+    @field_validator('long_lora_scaling_factors', mode='before')
+    def validate_long_lora_scaling_factors(cls, val):
+        if isinstance(val, str):
+            val = ast.literal_eval(val)
+        if not isinstance(val, tuple):
+            if isinstance(val, list):
+                val = tuple(float(v) for v in val)
+            elif isinstance(val, float):
+                val = (val, )
+            elif isinstance(val, int):
+                val = (float(val), )
+            else:
+                raise ValueError(
+                    "long_lora_scaling_factors must be convertible to a tuple of floats."
+                )
+        return val
+
     @field_validator('limit_mm_per_prompt', mode="before")
     def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
         out_dict: Dict[str, int] = {}
diff --git a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
index e526c396a..e51f83113 100644
--- a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
@@ -78,7 +78,11 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
             enable_lora=self.lmi_dist_config.enable_lora,
             max_loras=self.lmi_dist_config.max_loras,
             max_lora_rank=self.lmi_dist_config.max_lora_rank,
+            fully_sharded_loras=self.lmi_dist_config.fully_sharded_loras,
             lora_extra_vocab_size=self.lmi_dist_config.lora_extra_vocab_size,
+            long_lora_scaling_factors=self.lmi_dist_config.
+            long_lora_scaling_factors,
+            lora_dtype=self.lmi_dist_config.lora_dtype,
             max_cpu_loras=self.lmi_dist_config.max_cpu_loras,
             revision=self.lmi_dist_config.revision,
             enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill,
diff --git a/engines/python/setup/djl_python/tests/test_properties_manager.py b/engines/python/setup/djl_python/tests/test_properties_manager.py
index 20290ded6..b9b6c9f86 100644
--- a/engines/python/setup/djl_python/tests/test_properties_manager.py
+++ b/engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -440,6 +440,8 @@ def test_vllm_valid(properties):
                              int(properties['max_model_len']))
             self.assertEqual(vllm_configs.enforce_eager,
                              bool(properties['enforce_eager']))
+            self.assertEqual(vllm_configs.enable_lora,
+                             bool(properties['enable_lora']))
             self.assertEqual(vllm_configs.gpu_memory_utilization,
                              float(properties['gpu_memory_utilization']))
@@ -457,6 +459,36 @@ def test_enforce_eager(properties):
             vllm_props = VllmRbProperties(**properties)
             self.assertTrue(vllm_props.enforce_eager is False)
 
+        def test_long_lora_scaling_factors(properties):
+            properties['long_lora_scaling_factors'] = "3.0"
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+
+            properties['long_lora_scaling_factors'] = "3"
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+
+            properties['long_lora_scaling_factors'] = "3.0,4.0"
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+
+            properties['long_lora_scaling_factors'] = "3.0, 4.0 "
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+
+            properties['long_lora_scaling_factors'] = "(3.0,)"
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+
+            properties['long_lora_scaling_factors'] = "(3.0,4.0)"
+            vllm_props = VllmRbProperties(**properties)
+            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+
+        def test_invalid_long_lora_scaling_factors(properties):
+            properties['long_lora_scaling_factors'] = "a,b"
+            with self.assertRaises(ValueError):
+                VllmRbProperties(**properties)
+
         properties = {
             'model_id': 'sample_model_id',
             'engine': 'Python',
@@ -466,12 +498,15 @@ def test_enforce_eager(properties):
             'dtype': 'fp16',
             'quantize': 'awq',
             'enforce_eager': "True",
+            'enable_lora': "true",
             "gpu_memory_utilization": "0.85",
             'load_format': 'pt'
         }
         test_vllm_valid(properties.copy())
         test_invalid_quantization_method(properties.copy())
         test_enforce_eager(properties.copy())
+        test_long_lora_scaling_factors(properties.copy())
+        test_invalid_long_lora_scaling_factors(properties.copy())
 
     def test_sd_inf2_properties(self):
         properties = {
@@ -507,6 +542,7 @@ def test_with_min_properties():
             self.assertEqual(lmi_configs.dtype, 'auto')
             self.assertEqual(lmi_configs.gpu_memory_utilization, 0.9)
             self.assertTrue(lmi_configs.mpi_mode)
+            self.assertFalse(lmi_configs.enable_lora)
 
         def test_with_most_properties():
             properties = {
@@ -516,6 +552,7 @@
                 'max_rolling_batch_size': '64',
                 'max_rolling_batch_prefill_tokens': '12500',
                 'dtype': 'fp32',
+                'enable_lora': "true",
             }
 
             lmi_configs = LmiDistRbProperties(**properties, **min_properties)
@@ -533,6 +570,8 @@ def test_with_most_properties():
             self.assertEqual(lmi_configs.dtype, 'fp32')
             self.assertTrue(lmi_configs.mpi_mode)
             self.assertTrue(lmi_configs.trust_remote_code)
+            self.assertEqual(lmi_configs.enable_lora,
+                             bool(properties['enable_lora']))
 
         def test_invalid_quantization():
             properties = {'quantize': 'invalid'}
@@ -551,6 +590,36 @@ def test_quantization_squeezellm():
             self.assertEqual(lmi_configs.quantize.value,
                              LmiDistQuantizeMethods.squeezellm.value)
 
+        def test_long_lora_scaling_factors():
+            properties = {"long_lora_scaling_factors": "3.0"}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))
+
+            properties = {"long_lora_scaling_factors": "3"}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))
+
+            properties = {"long_lora_scaling_factors": "3.0,4.0"}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))
+
+            properties = {"long_lora_scaling_factors": "3.0, 4.0 "}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))
+
+            properties = {"long_lora_scaling_factors": "(3.0,)"}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))
+
+            properties = {"long_lora_scaling_factors": "(3.0,4.0)"}
+            lmi_configs = LmiDistRbProperties(**properties, **min_properties)
+            self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))
+
+        def test_invalid_long_lora_scaling_factors():
+            properties = {'long_lora_scaling_factors': "(a,b)"}
+            with self.assertRaises(ValueError):
+                LmiDistRbProperties(**properties, **min_properties)
+
         min_properties = {
             'engine': 'MPI',
             'mpi_mode': 'true',
@@ -561,6 +630,8 @@
         test_invalid_quantization()
         test_quantization_with_dtype_error()
         test_quantization_squeezellm()
+        test_long_lora_scaling_factors()
+        test_invalid_long_lora_scaling_factors()
 
     def test_scheduler_properties(self):
         properties = {
diff --git a/serving/docs/adapters.md b/serving/docs/adapters.md
index a5ee05b43..bbbaa8d20 100644
--- a/serving/docs/adapters.md
+++ b/serving/docs/adapters.md
@@ -29,16 +29,16 @@ More details can be found in the user guide.
 
 Here are the settings that are available when using LoRA Adapter.
 
-| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
-|----------------------------------|----------------------------------|-------------|--------------------|--------------------------------------------------------------------------------------------------------------|------------------|
-| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
-| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for those number adapters. | Default: `4` |
-| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
-| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras. Defaults to max_loras. | Default: `None` |
-| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `true` |
-| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
-| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None` |
-| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
+| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
+|----------------------------------|----------------------------------|-------------|--------------------|--------------------------------------------------------------------------------------------------------------|--------------------------------------|
+| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
+| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for that number of adapters. | Default: `4` |
+| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to the maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
+| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= max_loras. Defaults to max_loras. | Default: `None` |
+| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `false` |
+| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
+| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can differ from the base model scaling factor) so that LoRA adapters trained with those scaling factors can be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None`. Example: "3.0,4.0". |
+| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
 
 ## Managing Adapters
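
For reference, the settings documented above are supplied through `serving.properties` keys (the `option.*` items in the table) or the corresponding `OPTION_*` environment variables. A minimal illustrative configuration, with placeholder values only, might look like this:

```properties
# serving.properties -- illustrative values for the LoRA adapter settings above
option.enable_lora=true
option.max_loras=4
option.max_lora_rank=16
option.max_cpu_loras=8
option.fully_sharded_loras=false
option.lora_extra_vocab_size=256
# Comma-separated string parsed into a tuple of floats by the new validator
option.long_lora_scaling_factors=3.0,4.0
option.lora_dtype=auto
```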
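And a minimal sketch of how the new fields behave when the properties class is constructed directly, mirroring the unit tests in this change; `sample_model_id` is a placeholder, and any settings not shown fall back to their defaults:

```python
# Sketch only: exercises the new LoRA fields on VllmRbProperties,
# following the pattern used in test_properties_manager.py.
from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties

props = VllmRbProperties(
    model_id="sample_model_id",              # placeholder model id
    engine="Python",                         # vLLM rolling batch requires the Python engine
    enable_lora="true",                      # coerced to a bool by pydantic
    long_lora_scaling_factors="(3.0, 4.0)",  # string form handled by the 'before' validator
    lora_dtype="auto",
)

# The validator normalizes strings, lists, and scalars to a tuple of floats.
assert props.long_lora_scaling_factors == (3.0, 4.0)
assert props.enable_lora is True
```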