Created a profile called rhoai_inference_performance_blog_setup for…

… testing the "granite-3b-code-instruct" LLM model using local-ci. Fixed `KeyErrors` by using the right variable hierarchy. Do not throw exceptions but instead return the control back to the source. Enabled tests visualization. Variable `export_artifacts.enabled: true` appears to cause a few insignificant issues at the test_ci and plot stages. Revert "Do not throw exceptions but instead return the control back to the source". This reverts commit 9bc33c2. Using a small model, phi-2, for deploying on a single T4 tensor core GPU. Fixed the `ValueError: Bare-metal cluster not recognized` by adding a new cluster metal-profile. Updating kserve deployment mode from Raw to Serverless. Stick to the rawDeployment mode. Execute llm-load-test for 10 minutes. Deploy Kserve in serverless mode. Revert "Deploy Kserve in serverless mode". This reverts commit 1204060. Test mixtral-8x7b using 2.12 rc2 build. Use `phi-2` model for the testing efforts.
openshift-psap · Aug 26, 2024 · b8aa52b · b8aa52b
1 parent 656e220
commit b8aa52b
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 3 deletions.
diff --git a/projects/kserve/roles/kserve_validate_model/tasks/validate_model_vllm.yaml b/projects/kserve/roles/kserve_validate_model/tasks/validate_model_vllm.yaml
@@ -62,7 +62,7 @@
 
     echo $CURL_DATA >> "$query"
     for i in {1..{{ kserve_validate_model_query_count }}}; do
-      curl --output $dest --write-out "%{http_code}" \
+      curl -k --output $dest --write-out "%{http_code}" \
         {{ model_endpoint_cmd.stdout }}/v1/completions \
         -H "Content-Type: application/json" \
         -d "${CURL_DATA}" > "$status_code"

diff --git a/projects/kserve/testing/config.yaml b/projects/kserve/testing/config.yaml
@@ -35,6 +35,9 @@ ci_presets:
   cluster_a30:
     clusters.sutest.compute.machineset.type: "PSAP A30 node"
 
+  cluster_t4:
+    clusters.sutest.compute.machineset.type: "PSAP T4 node"
+
   cluster_2xa100:
     clusters.sutest.compute.machineset.type: "Beaker A100-80GB"
 
@@ -282,6 +285,25 @@ ci_presets:
     tests.e2e.llm_load_test.args.concurrency: [1, 2, 4, 8, 16, 32, 64, 96, 128]
     export_artifacts.enabled: true
 
+  ##### PSAP 1474 Setup
+  rhoai_inference_performance_blog_setup:
+    extends: [vllm, e2e_perf]
+    tests.e2e.models:
+    - name: phi-2
+      testing:
+        size: small  # Determines the dataset size
+        max_concurrency: 128  # Presumably an upper bound on the test concurrency (TODO: confirm)
+    tests.e2e.matbenchmark.enabled: true
+    tests.e2e.llm_load_test.enabled: true
+    tests.e2e.llm_load_test.args.duration: 180
+    tests.e2e.llm_load_test.args.concurrency: 1     # Also accepts a list of values
+    tests.visualize: true
+    matbench.lts.opensearch.export.enabled: false
+    export_artifacts.enabled: false
+    export_artifacts.bucket: rhods-baremetal-results
+    export_artifacts.path_prefix: local-ci/rhods/kserve
+  ##### PSAP 1474 End of Setup
+
   # --
   # single-model vLLM
   # --
@@ -461,6 +483,7 @@ clusters:
     e26-h23-000-r650: cluster_icelake
     bb37-h13-000-r750.rdu3.labs.perfscale.redhat.com: cluster_a30
     nvd-srv-02.nvidia.eng.rdu2.redhat.com: cluster_2xa100
+    x37-h13-000-r740xd.rdu3.labs.perfscale.redhat.com: cluster_t4
   create:
     type: single # can be: single, ocp, managed
     keep: false

diff --git a/projects/kserve/testing/test_e2e.py b/projects/kserve/testing/test_e2e.py
@@ -346,7 +346,7 @@ def test_one_model(
     else:
         results_dir = env.ARTIFACT_DIR
         with env.NextArtifactDir("plots"):
-            visualize.prepare_matbench()
+            #visualize.prepare_matbench()
             import test
             test.generate_plots(results_dir)
 
@@ -636,7 +636,7 @@ def launch_test_consolidated_model(consolidated_model, dedicated_dir=True):
 
 
 def matbenchmark_run_llm_load_test(namespace, llm_load_test_args, model_max_concurrency):
-    visualize.prepare_matbench()
+    #visualize.prepare_matbench()
 
     with env.NextArtifactDir("matbenchmark__llm_load_test"):
         benchmark_values = {}