[lora] Add lora configs and validation
xyang16 committed Nov 14, 2024
1 parent 62a63be commit c97d8a2
Showing 5 changed files with 126 additions and 12 deletions.
@@ -10,8 +10,9 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import ast
from enum import Enum
from typing import Optional, Mapping
from typing import Optional, Mapping, Tuple

from pydantic import model_validator, field_validator

@@ -60,7 +61,10 @@ class LmiDistRbProperties(Properties):
enable_lora: Optional[bool] = False
max_loras: Optional[int] = 4
max_lora_rank: Optional[int] = 16
fully_sharded_loras: bool = False
lora_extra_vocab_size: Optional[int] = 256
long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
lora_dtype: Optional[str] = 'auto'
max_cpu_loras: Optional[int] = None
max_logprobs: Optional[int] = 20
enable_chunked_prefill: Optional[bool] = None
@@ -94,6 +98,23 @@ def validate_speculative_and_fml(self):
)
return self

@field_validator('long_lora_scaling_factors', mode='before')
def validate_long_lora_scaling_factors(cls, val):
if isinstance(val, str):
val = ast.literal_eval(val)
if not isinstance(val, tuple):
if isinstance(val, list):
val = tuple(float(v) for v in val)
elif isinstance(val, float):
val = (val, )
elif isinstance(val, int):
val = (float(val), )
else:
raise ValueError(
"long_lora_scaling_factors must be convertible to a tuple of floats."
)
return val

@field_validator('limit_mm_per_prompt', mode="before")
def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
out_dict: Dict[str, int] = {}
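For context, the following is a minimal standalone sketch of the normalization the new validator performs. The function name and the asserts are illustrative only and are not part of this commit:

```python
import ast
from typing import Optional, Tuple


def normalize_scaling_factors(val) -> Optional[Tuple[float, ...]]:
    """Mirror of the validator's coercion: accept a string, list, float,
    or int and return a tuple of floats (illustrative sketch only)."""
    if isinstance(val, str):
        # "3.0,4.0" and "(3.0,4.0)" both evaluate to a tuple; "3" to an int.
        val = ast.literal_eval(val)
    if not isinstance(val, tuple):
        if isinstance(val, list):
            val = tuple(float(v) for v in val)
        elif isinstance(val, (int, float)):
            val = (float(val), )
        else:
            raise ValueError(
                "long_lora_scaling_factors must be convertible to a tuple of floats."
            )
    return val


assert normalize_scaling_factors("3.0") == (3.0, )
assert normalize_scaling_factors("3.0,4.0") == (3.0, 4.0)
assert normalize_scaling_factors("(3.0,)") == (3.0, )
```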
@@ -10,6 +10,7 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import ast
from enum import Enum
from typing import Optional, Any, Mapping, Tuple

@@ -51,7 +52,7 @@ class VllmRbProperties(Properties):
max_lora_rank: Optional[int] = 16
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = 256
long_lora_scaling_factors: Optional[Tuple[float]] = None
long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
lora_dtype: Optional[str] = 'auto'
max_cpu_loras: Optional[int] = None

@@ -91,6 +92,23 @@ def validate_engine(cls, engine):
f"Need python engine to start vLLM RollingBatcher")
return engine

@field_validator('long_lora_scaling_factors', mode='before')
def validate_long_lora_scaling_factors(cls, val):
if isinstance(val, str):
val = ast.literal_eval(val)
if not isinstance(val, tuple):
if isinstance(val, list):
val = tuple(float(v) for v in val)
elif isinstance(val, float):
val = (val, )
elif isinstance(val, int):
val = (float(val), )
else:
raise ValueError(
"long_lora_scaling_factors must be convertible to a tuple of floats."
)
return val

@field_validator('limit_mm_per_prompt', mode="before")
def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
out_dict: Dict[str, int] = {}
@@ -78,7 +78,11 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
enable_lora=self.lmi_dist_config.enable_lora,
max_loras=self.lmi_dist_config.max_loras,
max_lora_rank=self.lmi_dist_config.max_lora_rank,
fully_sharded_loras=self.lmi_dist_config.fully_sharded_loras,
lora_extra_vocab_size=self.lmi_dist_config.lora_extra_vocab_size,
long_lora_scaling_factors=self.lmi_dist_config.
long_lora_scaling_factors,
lora_dtype=self.lmi_dist_config.lora_dtype,
max_cpu_loras=self.lmi_dist_config.max_cpu_loras,
revision=self.lmi_dist_config.revision,
enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill,
71 changes: 71 additions & 0 deletions engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -440,6 +440,8 @@ def test_vllm_valid(properties):
int(properties['max_model_len']))
self.assertEqual(vllm_configs.enforce_eager,
bool(properties['enforce_eager']))
self.assertEqual(vllm_configs.enable_lora,
bool(properties['enable_lora']))
self.assertEqual(vllm_configs.gpu_memory_utilization,
float(properties['gpu_memory_utilization']))

@@ -457,6 +459,36 @@ def test_enforce_eager(properties):
vllm_props = VllmRbProperties(**properties)
self.assertTrue(vllm_props.enforce_eager is False)

def test_long_lora_scaling_factors(properties):
properties['long_lora_scaling_factors'] = "3.0"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "3"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "3.0,4.0"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

properties['long_lora_scaling_factors'] = "3.0, 4.0 "
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

properties['long_lora_scaling_factors'] = "(3.0,)"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "(3.0,4.0)"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

def test_invalid_long_lora_scaling_factors(properties):
properties['long_lora_scaling_factors'] = "a,b"
with self.assertRaises(ValueError):
VllmRbProperties(**properties)

properties = {
'model_id': 'sample_model_id',
'engine': 'Python',
@@ -466,12 +498,15 @@ def test_enforce_eager(properties):
'dtype': 'fp16',
'quantize': 'awq',
'enforce_eager': "True",
'enable_lora': "true",
"gpu_memory_utilization": "0.85",
'load_format': 'pt'
}
test_vllm_valid(properties.copy())
test_invalid_quantization_method(properties.copy())
test_enforce_eager(properties.copy())
test_long_lora_scaling_factors(properties.copy())
test_invalid_long_lora_scaling_factors(properties.copy())

def test_sd_inf2_properties(self):
properties = {
@@ -507,6 +542,7 @@ def test_with_min_properties():
self.assertEqual(lmi_configs.dtype, 'auto')
self.assertEqual(lmi_configs.gpu_memory_utilization, 0.9)
self.assertTrue(lmi_configs.mpi_mode)
self.assertFalse(lmi_configs.enable_lora)

def test_with_most_properties():
properties = {
@@ -516,6 +552,7 @@ def test_with_most_properties():
'max_rolling_batch_size': '64',
'max_rolling_batch_prefill_tokens': '12500',
'dtype': 'fp32',
'enable_lora': "true",
}

lmi_configs = LmiDistRbProperties(**properties, **min_properties)
@@ -533,6 +570,8 @@ def test_with_most_properties():
self.assertEqual(lmi_configs.dtype, 'fp32')
self.assertTrue(lmi_configs.mpi_mode)
self.assertTrue(lmi_configs.trust_remote_code)
self.assertEqual(lmi_configs.enable_lora,
bool(properties['enable_lora']))

def test_invalid_quantization():
properties = {'quantize': 'invalid'}
@@ -551,6 +590,36 @@ def test_quantization_squeezellm():
self.assertEqual(lmi_configs.quantize.value,
LmiDistQuantizeMethods.squeezellm.value)

def test_long_lora_scaling_factors():
properties = {"long_lora_scaling_factors": "3.0"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "3"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "3.0,4.0"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

properties = {"long_lora_scaling_factors": "3.0, 4.0 "}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

properties = {"long_lora_scaling_factors": "(3.0,)"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "(3.0,4.0)"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

def test_invalid_long_lora_scaling_factors():
properties = {'long_lora_scaling_factors': "(a,b)"}
with self.assertRaises(ValueError):
LmiDistRbProperties(**properties, **min_properties)

min_properties = {
'engine': 'MPI',
'mpi_mode': 'true',
@@ -561,6 +630,8 @@ def test_quantization_squeezellm():
test_invalid_quantization()
test_quantization_with_dtype_error()
test_quantization_squeezellm()
test_long_lora_scaling_factors()
test_invalid_long_lora_scaling_factors()

def test_scheduler_properties(self):
properties = {
20 changes: 10 additions & 10 deletions serving/docs/adapters.md
@@ -29,16 +29,16 @@ More details can be found in the user guide.

Here are the settings that are available when using LoRA Adapter.

| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for those number adapters. | Default: `4` |
| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras. Defaults to max_loras. | Default: `None` |
| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `true` |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None` |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for those number adapters. | Default: `4` |
| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras. Defaults to max_loras. | Default: `None` |
| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `true` |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None`. Example: "3.0,4.0". |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
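
For reference (not part of this diff), a minimal sketch of how these options might be set in a `serving.properties` file; the values below are illustrative only:

```
# serving.properties (illustrative values; option names come from the table above)
option.enable_lora=true
option.max_loras=4
option.max_lora_rank=16
option.fully_sharded_loras=false
option.lora_extra_vocab_size=256
option.long_lora_scaling_factors=3.0,4.0
option.lora_dtype=auto
option.max_cpu_loras=8
```

The same settings can also be supplied through the environment-variable form listed above, e.g. `OPTION_LONG_LORA_SCALING_FACTORS=3.0,4.0`.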

## Managing Adapters

