[lora] Add lora configs and validation
xyang16 committed Nov 14, 2024
1 parent 62a63be commit c97d8a2
Showing 5 changed files with 126 additions and 12 deletions.
@@ -10,8 +10,9 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import ast
from enum import Enum
from typing import Optional, Mapping
from typing import Optional, Mapping, Tuple

from pydantic import model_validator, field_validator

@@ -60,7 +61,10 @@ class LmiDistRbProperties(Properties):
enable_lora: Optional[bool] = False
max_loras: Optional[int] = 4
max_lora_rank: Optional[int] = 16
fully_sharded_loras: bool = False
lora_extra_vocab_size: Optional[int] = 256
long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
lora_dtype: Optional[str] = 'auto'
max_cpu_loras: Optional[int] = None
max_logprobs: Optional[int] = 20
enable_chunked_prefill: Optional[bool] = None
@@ -94,6 +98,23 @@ def validate_speculative_and_fml(self):
)
return self

@field_validator('long_lora_scaling_factors', mode='before')
def validate_long_lora_scaling_factors(cls, val):
if isinstance(val, str):
val = ast.literal_eval(val)
if not isinstance(val, tuple):
if isinstance(val, list):
val = tuple(float(v) for v in val)
elif isinstance(val, float):
val = (val, )
elif isinstance(val, int):
val = (float(val), )
else:
raise ValueError(
"long_lora_scaling_factors must be convertible to a tuple of floats."
)
return val

@field_validator('limit_mm_per_prompt', mode="before")
def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
out_dict: Dict[str, int] = {}
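For context, the following is a minimal standalone sketch of the normalization the new validator performs. The function name and the asserts are illustrative only and are not part of this commit:

```python
import ast
from typing import Optional, Tuple


def normalize_scaling_factors(val) -> Optional[Tuple[float, ...]]:
    """Mirror of the validator's coercion: accept a string, list, float,
    or int and return a tuple of floats (illustrative sketch only)."""
    if isinstance(val, str):
        # "3.0,4.0" and "(3.0,4.0)" both evaluate to a tuple; "3" to an int.
        val = ast.literal_eval(val)
    if not isinstance(val, tuple):
        if isinstance(val, list):
            val = tuple(float(v) for v in val)
        elif isinstance(val, (int, float)):
            val = (float(val), )
        else:
            raise ValueError(
                "long_lora_scaling_factors must be convertible to a tuple of floats."
            )
    return val


assert normalize_scaling_factors("3.0") == (3.0, )
assert normalize_scaling_factors("3.0,4.0") == (3.0, 4.0)
assert normalize_scaling_factors("(3.0,)") == (3.0, )
```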
@@ -10,6 +10,7 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import ast
from enum import Enum
from typing import Optional, Any, Mapping, Tuple

@@ -51,7 +52,7 @@ class VllmRbProperties(Properties):
max_lora_rank: Optional[int] = 16
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = 256
long_lora_scaling_factors: Optional[Tuple[float]] = None
long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
lora_dtype: Optional[str] = 'auto'
max_cpu_loras: Optional[int] = None

@@ -91,6 +92,23 @@ def validate_engine(cls, engine):
f"Need python engine to start vLLM RollingBatcher")
return engine

@field_validator('long_lora_scaling_factors', mode='before')
def validate_long_lora_scaling_factors(cls, val):
if isinstance(val, str):
val = ast.literal_eval(val)
if not isinstance(val, tuple):
if isinstance(val, list):
val = tuple(float(v) for v in val)
elif isinstance(val, float):
val = (val, )
elif isinstance(val, int):
val = (float(val), )
else:
raise ValueError(
"long_lora_scaling_factors must be convertible to a tuple of floats."
)
return val

@field_validator('limit_mm_per_prompt', mode="before")
def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
out_dict: Dict[str, int] = {}
@@ -78,7 +78,11 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
enable_lora=self.lmi_dist_config.enable_lora,
max_loras=self.lmi_dist_config.max_loras,
max_lora_rank=self.lmi_dist_config.max_lora_rank,
fully_sharded_loras=self.lmi_dist_config.fully_sharded_loras,
lora_extra_vocab_size=self.lmi_dist_config.lora_extra_vocab_size,
long_lora_scaling_factors=self.lmi_dist_config.
long_lora_scaling_factors,
lora_dtype=self.lmi_dist_config.lora_dtype,
max_cpu_loras=self.lmi_dist_config.max_cpu_loras,
revision=self.lmi_dist_config.revision,
enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill,
71 changes: 71 additions & 0 deletions engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -440,6 +440,8 @@ def test_vllm_valid(properties):
int(properties['max_model_len']))
self.assertEqual(vllm_configs.enforce_eager,
bool(properties['enforce_eager']))
self.assertEqual(vllm_configs.enable_lora,
bool(properties['enable_lora']))
self.assertEqual(vllm_configs.gpu_memory_utilization,
float(properties['gpu_memory_utilization']))

@@ -457,6 +459,36 @@ def test_enforce_eager(properties):
vllm_props = VllmRbProperties(**properties)
self.assertTrue(vllm_props.enforce_eager is False)

def test_long_lora_scaling_factors(properties):
properties['long_lora_scaling_factors'] = "3.0"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "3"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "3.0,4.0"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

properties['long_lora_scaling_factors'] = "3.0, 4.0 "
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

properties['long_lora_scaling_factors'] = "(3.0,)"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))

properties['long_lora_scaling_factors'] = "(3.0,4.0)"
vllm_props = VllmRbProperties(**properties)
self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))

def test_invalid_long_lora_scaling_factors(properties):
properties['long_lora_scaling_factors'] = "a,b"
with self.assertRaises(ValueError):
VllmRbProperties(**properties)

properties = {
'model_id': 'sample_model_id',
'engine': 'Python',
@@ -466,12 +498,15 @@ def test_enforce_eager(properties):
'dtype': 'fp16',
'quantize': 'awq',
'enforce_eager': "True",
'enable_lora': "true",
"gpu_memory_utilization": "0.85",
'load_format': 'pt'
}
test_vllm_valid(properties.copy())
test_invalid_quantization_method(properties.copy())
test_enforce_eager(properties.copy())
test_long_lora_scaling_factors(properties.copy())
test_invalid_long_lora_scaling_factors(properties.copy())

def test_sd_inf2_properties(self):
properties = {
@@ -507,6 +542,7 @@ def test_with_min_properties():
self.assertEqual(lmi_configs.dtype, 'auto')
self.assertEqual(lmi_configs.gpu_memory_utilization, 0.9)
self.assertTrue(lmi_configs.mpi_mode)
self.assertFalse(lmi_configs.enable_lora)

def test_with_most_properties():
properties = {
@@ -516,6 +552,7 @@ def test_with_most_properties():
'max_rolling_batch_size': '64',
'max_rolling_batch_prefill_tokens': '12500',
'dtype': 'fp32',
'enable_lora': "true",
}

lmi_configs = LmiDistRbProperties(**properties, **min_properties)
@@ -533,6 +570,8 @@ def test_with_most_properties():
self.assertEqual(lmi_configs.dtype, 'fp32')
self.assertTrue(lmi_configs.mpi_mode)
self.assertTrue(lmi_configs.trust_remote_code)
self.assertEqual(lmi_configs.enable_lora,
bool(properties['enable_lora']))

def test_invalid_quantization():
properties = {'quantize': 'invalid'}
@@ -551,6 +590,36 @@ def test_quantization_squeezellm():
self.assertEqual(lmi_configs.quantize.value,
LmiDistQuantizeMethods.squeezellm.value)

def test_long_lora_scaling_factors():
properties = {"long_lora_scaling_factors": "3.0"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "3"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "3.0,4.0"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

properties = {"long_lora_scaling_factors": "3.0, 4.0 "}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

properties = {"long_lora_scaling_factors": "(3.0,)"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, ))

properties = {"long_lora_scaling_factors": "(3.0,4.0)"}
lmi_configs = LmiDistRbProperties(**properties, **min_properties)
self.assertEqual(lmi_configs.long_lora_scaling_factors, (3.0, 4.0))

def test_invalid_long_lora_scaling_factors():
properties = {'long_lora_scaling_factors': "(a,b)"}
with self.assertRaises(ValueError):
LmiDistRbProperties(**properties, **min_properties)

min_properties = {
'engine': 'MPI',
'mpi_mode': 'true',
@@ -561,6 +630,8 @@ def test_quantization_squeezellm():
test_invalid_quantization()
test_quantization_with_dtype_error()
test_quantization_squeezellm()
test_long_lora_scaling_factors()
test_invalid_long_lora_scaling_factors()

def test_scheduler_properties(self):
properties = {
20 changes: 10 additions & 10 deletions serving/docs/adapters.md
@@ -29,16 +29,16 @@ More details can be found in the user guide.

Here are the settings that are available when using LoRA Adapter.

| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for those number adapters. | Default: `4` |
| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras. Defaults to max_loras. | Default: `None` |
| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `true` |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None` |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras | OPTION_MAX_LORAS | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters that can be run at once. Allocates GPU memory for those number adapters. | Default: `4` |
| option.max_lora_rank | OPTION_MAX_LORA_RANK | \>= 0.27.0 | Pass Through | This config determines the maximum rank allowed for a LoRA adapter. Set this value to maximum rank of your adapters. Setting a larger value will enable more adapters at a greater memory usage cost. | Default: `16` |
| option.max_cpu_loras | OPTION_MAX_CPU_LORAS | \>= 0.27.0 | Pass Through | Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras. Defaults to max_loras. | Default: `None` |
| option.fully_sharded_loras | OPTION_FULLY_SHARDED_LORAS | \>= 0.31.0 | Pass Through | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this will use the fully sharded layers. At high sequence length, max rank or tensor parallel size, this is likely faster. | Default: `true` |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None`. Example: "3.0,4.0". |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
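
For reference (not part of this diff), a minimal sketch of how these options might be set in a `serving.properties` file; the values below are illustrative only:

```
# serving.properties (illustrative values; option names come from the table above)
option.enable_lora=true
option.max_loras=4
option.max_lora_rank=16
option.fully_sharded_loras=false
option.lora_extra_vocab_size=256
option.long_lora_scaling_factors=3.0,4.0
option.lora_dtype=auto
option.max_cpu_loras=8
```

The same settings can also be supplied through the environment-variable form listed above, e.g. `OPTION_LONG_LORA_SCALING_FACTORS=3.0,4.0`.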

## Managing Adapters

