fix tgi/vllm/trt examples
IlyasMoutawwakil committed Dec 10, 2024
1 parent 49a7acc commit 46bfe63
Showing 5 changed files with 13 additions and 18 deletions.
examples/cuda_pytorch_llama.yaml (3 additions, 9 deletions)

```diff
@@ -20,16 +20,10 @@ backend:
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 
 scenario:
-  memory: true
-  latency: true
-
-  warmup_runs: 10
-  iterations: 10
-  duration: 10
-
   input_shapes:
-    batch_size: 1
-    sequence_length: 256
+    batch_size: 4
+    sequence_length: 64
 
   generate_kwargs:
     max_new_tokens: 32
+    min_new_tokens: 32
```
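Putting the hunk above back together, the trimmed scenario section of the PyTorch example ends up looking roughly like this (a reconstruction from the diff; key indentation is assumed to follow the other example files):

```yaml
# examples/cuda_pytorch_llama.yaml — scenario section after this commit
# (reconstructed sketch, not copied verbatim from the repository)
scenario:
  input_shapes:
    batch_size: 4
    sequence_length: 64

  generate_kwargs:
    max_new_tokens: 32
    min_new_tokens: 32
```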
examples/cuda_tgi_llama.yaml (3 additions, 2 deletions)

```diff
@@ -15,6 +15,7 @@ launcher:
 backend:
   device: cuda
   device_ids: 0
+  cuda_graphs: 0 # remove for better perf but bigger memory footprint
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 
 scenario:
@@ -23,5 +24,5 @@ scenario:
     sequence_length: 64
 
   generate_kwargs:
-    max_new_tokens: 16
-    min_new_tokens: 16
+    max_new_tokens: 32
+    min_new_tokens: 32
```
examples/cuda_trt_llama.yaml (2 additions, 2 deletions)

```diff
@@ -23,5 +23,5 @@ scenario:
     sequence_length: 64
 
   generate_kwargs:
-    max_new_tokens: 16
-    min_new_tokens: 16
+    max_new_tokens: 32
+    min_new_tokens: 32
```
examples/cuda_vllm_llama.yaml (4 additions, 4 deletions)

```diff
@@ -15,16 +15,16 @@ launcher:
 backend:
   device: cuda
   device_ids: 0
-  serving_mode: offline
+  serving_mode: online # server-like
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
   engine_args:
-    enforce_eager: true
+    enforce_eager: true # remove for better perf but bigger memory footprint
 
 scenario:
   input_shapes:
     batch_size: 4
     sequence_length: 64
 
   generate_kwargs:
-    max_new_tokens: 16
-    min_new_tokens: 16
+    max_new_tokens: 32
+    min_new_tokens: 32
```
examples/mps_pytorch_bert.yaml (1 addition, 1 deletion)

```diff
@@ -1,7 +1,7 @@
 defaults:
   - benchmark
   - scenario: inference
-  - launcher: inline # mps has problems with multi processing (process launcher)
+  - launcher: inline # mps fails with python multi-processing for some reason
   - backend: pytorch
   - _base_
   - _self_
```
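These example files are Hydra configs (note the `defaults:` list in the mps example). Assuming the standard optimum-benchmark CLI is installed and run from the repository root, one of the fixed examples could be launched with something like:

```shell
# Hedged sketch: assumes the optimum-benchmark CLI is installed and that it
# resolves configs via Hydra's --config-dir/--config-name flags, as in the
# project's own examples. Requires a CUDA machine with a TGI-capable setup.
optimum-benchmark --config-dir examples/ --config-name cuda_tgi_llama
```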
