Added meta-llama 3.1 8B and 70B Instruct models for testing
mcharanrm committed Sep 12, 2024
1 parent 7d8657f commit c7c669b
Showing 6 changed files with 53 additions and 18 deletions.
5 changes: 5 additions & 0 deletions projects/core/library/export.py
@@ -67,6 +67,11 @@ def export_artifacts(artifacts_dirname, test_step=None):
         job_id = job[4:].replace("/job/", "_")
 
         run_id = f"middleware_jenkins/{job_id}/{build_number}"
 
+    elif os.environ.get("JOB_NAME") == "local-ci":
+        run_id = os.environ["TEST_RUN_IDENTIFIER"]
+        test_step = ''
+
     else:
         logging.error("CI engine not recognized, cannot build the run id ...")
         raise ValueError("CI engine not recognized, cannot build the run id ...")
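For context, a minimal sketch of the run-id selection this hunk extends. The Jenkins branch shown here (the `job.startswith("job/")` guard and the `build_number` lookup) is inferred from the visible context lines, not copied from the real `export_artifacts`:

import logging
import os

def build_run_id(test_step=None):
    # Simplified stand-in for the logic in projects/core/library/export.py.
    job = os.environ.get("JOB_NAME", "")

    if job.startswith("job/"):  # assumed shape of the middleware Jenkins branch
        job_id = job[4:].replace("/job/", "_")
        build_number = os.environ.get("BUILD_NUMBER", "0")
        run_id = f"middleware_jenkins/{job_id}/{build_number}"
    elif job == "local-ci":
        # New in this commit: local-ci runs take their run id straight from
        # TEST_RUN_IDENTIFIER and clear the test-step suffix.
        run_id = os.environ["TEST_RUN_IDENTIFIER"]
        test_step = ''
    else:
        logging.error("CI engine not recognized, cannot build the run id ...")
        raise ValueError("CI engine not recognized, cannot build the run id ...")

    return run_id, test_step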
@@ -1,7 +1,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
-namePrefix: llama-3-dot-1-8b-
+namePrefix: llama-3.1-70b-instruct-
 
 resources:
 - ../../base
@@ -0,0 +1,19 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: isvc
+spec:
+  predictor:
+    minReplicas: 1
+    maxReplicas: 1
+    model:
+      storageUri: s3://psap-1474-blog-models/meta-llama/Meta-Llama-3.1-70B-Instruct/
+      args:
+        - --dtype=bfloat16
+        - --tensor-parallel-size=8
+        - --max-model-len=4096
+      resources:
+        requests:
+          nvidia.com/gpu: "8"
+        limits:
+          nvidia.com/gpu: "8"
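These are vLLM runtime flags: `--tensor-parallel-size=8` shards the 70B model across eight GPUs, so it must match the `nvidia.com/gpu` request and limit. A small sanity-check sketch, assuming PyYAML is available and a local copy of this patch file named `patch.yaml`:

import sys
import yaml  # assumption: PyYAML is installed

def check_tp_matches_gpus(path):
    # Fail if --tensor-parallel-size disagrees with the GPU request.
    with open(path) as f:
        model = yaml.safe_load(f)["spec"]["predictor"]["model"]

    tp = next(
        int(arg.split("=", 1)[1])
        for arg in model["args"]
        if arg.startswith("--tensor-parallel-size=")
    )
    gpus = int(model["resources"]["requests"]["nvidia.com/gpu"])

    if tp != gpus:
        sys.exit(f"mismatch: tensor-parallel-size={tp}, gpu request={gpus}")
    print(f"OK: tensor parallelism {tp} across {gpus} GPUs")

check_tp_matches_gpus("patch.yaml")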
@@ -0,0 +1,14 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namePrefix: llama-3.1-8b-instruct-
+
+resources:
+- ../../base
+
+patches:
+- path: patch.yaml
+  target:
+    kind: InferenceService
+  options:
+    allowNameChange: true
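Each model directory here is a kustomize overlay on a shared `../../base` InferenceService: `namePrefix` renames the resource (which is why the patch target sets `allowNameChange: true`), and `patch.yaml` overrides the model-specific fields. A quick way to inspect the rendered result, sketched with `kubectl kustomize` called from Python; the overlay path is hypothetical:

import subprocess
import yaml  # assumption: PyYAML is installed

# Hypothetical path; point this at the llama-3.1-8b-instruct overlay directory.
overlay = "kustomize/llama-3.1-8b-instruct"

rendered = subprocess.run(
    ["kubectl", "kustomize", overlay],
    check=True, capture_output=True, text=True,
).stdout

isvc = yaml.safe_load(rendered)
# The base resource is named "isvc"; the prefix should now be applied to it.
assert isvc["metadata"]["name"] == "llama-3.1-8b-instruct-isvc"
print(isvc["spec"]["predictor"]["model"]["storageUri"])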
@@ -4,11 +4,14 @@ metadata:
   name: isvc
 spec:
   predictor:
+    minReplicas: 1
+    maxReplicas: 1
     model:
-      storageUri: s3://psap-hf-models/Meta-Llama-3.1-8B/Meta-Llama-3.1-8B/
+      storageUri: s3://psap-1474-blog-models/meta-llama/Meta-Llama-3.1-8B-Instruct/
       args:
         - --dtype=bfloat16
-        - --tensor-parallel-size=1
+        - --tensor-parallel-size=1
+        - --max-model-len=4096
       resources:
         requests:
           nvidia.com/gpu: "1"
24 changes: 9 additions & 15 deletions projects/kserve/testing/config.yaml
@@ -96,6 +96,10 @@ ci_presets:
 
   raw:
     kserve.raw_deployment.enabled: true
+    tests.e2e.llm_load_test.args.interface: "http"
+
+  serverless:
+    tests.e2e.llm_load_test.args.interface: "https"
 
   quick_raw:
     extends: [raw, no_gpu]
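The interface setting moves out of the vllm preset (next hunk) and into the `raw`/`serverless` presets, so each deployment mode carries its own llm-load-test interface. A minimal sketch of how dotted-key presets and `extends` plausibly compose, assuming simple last-writer-wins merging (the real resolver may differ); the preset bodies are trimmed to the relevant keys:

ci_presets = {
    "raw": {"kserve.raw_deployment.enabled": True,
            "tests.e2e.llm_load_test.args.interface": "http"},
    "serverless": {"tests.e2e.llm_load_test.args.interface": "https"},
    "rhoai_inference_performance_blog_setup": {
        "extends": ["serverless"],
        "tests.e2e.llm_load_test.args.duration": 60,
    },
}

def resolve(name):
    # Flatten a preset and everything it extends into one dotted-key dict.
    preset = dict(ci_presets[name])
    merged = {}
    for parent in preset.pop("extends", []):
        merged.update(resolve(parent))
    merged.update(preset)  # a preset's own keys win over inherited ones
    return merged

print(resolve("rhoai_inference_performance_blog_setup"))
# {'tests.e2e.llm_load_test.args.interface': 'https',
#  'tests.e2e.llm_load_test.args.duration': 60}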
@@ -139,7 +143,6 @@ ci_presets:
 #   kserve.model.serving_runtime.update_image: False
     tests.e2e.llm_load_test.args.plugin: "openai_plugin"
     tests.e2e.llm_load_test.args.streaming: true
-    tests.e2e.llm_load_test.args.interface: "https"
     tests.e2e.llm_load_test.args.endpoint: "/v1/completions"
 #   tests.e2e.llm_load_test.args.model_name: "/mnt/models/"
 #   tests.e2e.llm_load_test.args.host: "http://localhost:8033"
@@ -292,30 +295,21 @@ ci_presets:
 
   ##### PSAP 1474 Setup
   rhoai_inference_performance_blog_setup:
-    extends: [vllm, e2e_perf, metal]
+    extends: [serverless, vllm, e2e_perf, metal]
     tests.e2e.models:
     - name: phi-2
       testing:
         size: small # determines the dataset size
         max_concurrency: 1 # caps the concurrency for this model
     - name: granite-8b-code-instruct
       testing:
         size: small
         max_concurrency: 1
-    - name: llama-3-dot-1-8b
-      testing:
-        size: small
-        max_concurrency: 1
 
     tests.e2e.matbenchmark.enabled: true
     tests.e2e.llm_load_test.enabled: true
-    tests.e2e.llm_load_test.args.duration: 180
+    tests.e2e.llm_load_test.args.duration: 60
     tests.e2e.llm_load_test.args.concurrency: [1, 2, 4, 8, 16, 32, 64, 128, 256] # also supports a list
     tests.visualize: true
     matbench.lts.opensearch.export.enabled: false
-    export_artifacts.enabled: false
-    export_artifacts.bucket: rhods-baremetal-results
-    export_artifacts.path_prefix: local-ci/rhods/kserve
+    export_artifacts.enabled: true
+    export_artifacts.bucket: psap-1474-results
+    export_artifacts.path_prefix: dry-run
   ##### PSAP 1474 End of Setup
 
 # --
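With `matbenchmark.enabled: true`, the concurrency list fans out into one benchmark run per level, each now lasting 60 seconds instead of 180. Purely as an illustration of the loop shape; `run_llm_load_test` is a hypothetical stand-in for the real harness:

DURATION_S = 60
CONCURRENCY_LEVELS = [1, 2, 4, 8, 16, 32, 64, 128, 256]

def run_llm_load_test(concurrency, duration_s, interface="https",
                      plugin="openai_plugin", endpoint="/v1/completions"):
    # Hypothetical stand-in: one llm-load-test invocation per concurrency level.
    print(f"load test: concurrency={concurrency}, {duration_s}s, "
          f"{interface} {endpoint} via {plugin}")

for concurrency in CONCURRENCY_LEVELS:
    run_llm_load_test(concurrency, DURATION_S)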
