Added meta-llama 3.1 8B and 70B Instruct models for testing
mcharanrm committed Sep 12, 2024
1 parent 7d8657f commit c7c669b
Showing 6 changed files with 53 additions and 18 deletions.
5 changes: 5 additions & 0 deletions projects/core/library/export.py
@@ -67,6 +67,11 @@ def export_artifacts(artifacts_dirname, test_step=None):
         job_id = job[4:].replace("/job/", "_")
 
         run_id = f"middleware_jenkins/{job_id}/{build_number}"
 
+    elif os.environ.get("JOB_NAME") == "local-ci":
+        run_id = os.environ["TEST_RUN_IDENTIFIER"]
+        test_step = ''
+
     else:
         logging.error("CI engine not recognized, cannot build the run id ...")
         raise ValueError("CI engine not recognized, cannot build the run id ...")
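For context, a minimal sketch of the run-id selection this hunk extends. The Jenkins branch shown here (the `job.startswith("job/")` guard and the `build_number` lookup) is inferred from the visible context lines, not copied from the real `export_artifacts`:

import logging
import os

def build_run_id(test_step=None):
    # Simplified stand-in for the logic in projects/core/library/export.py.
    job = os.environ.get("JOB_NAME", "")

    if job.startswith("job/"):  # assumed shape of the middleware Jenkins branch
        job_id = job[4:].replace("/job/", "_")
        build_number = os.environ.get("BUILD_NUMBER", "0")
        run_id = f"middleware_jenkins/{job_id}/{build_number}"
    elif job == "local-ci":
        # New in this commit: local-ci runs take their run id straight from
        # TEST_RUN_IDENTIFIER and clear the test-step suffix.
        run_id = os.environ["TEST_RUN_IDENTIFIER"]
        test_step = ''
    else:
        logging.error("CI engine not recognized, cannot build the run id ...")
        raise ValueError("CI engine not recognized, cannot build the run id ...")

    return run_id, test_step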
@@ -1,7 +1,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
-namePrefix: llama-3-dot-1-8b-
+namePrefix: llama-3.1-70b-instruct-
 
 resources:
 - ../../base
@@ -0,0 +1,19 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: isvc
+spec:
+  predictor:
+    minReplicas: 1
+    maxReplicas: 1
+    model:
+      storageUri: s3://psap-1474-blog-models/meta-llama/Meta-Llama-3.1-70B-Instruct/
+      args:
+        - --dtype=bfloat16
+        - --tensor-parallel-size=8
+        - --max-model-len=4096
+      resources:
+        requests:
+          nvidia.com/gpu: "8"
+        limits:
+          nvidia.com/gpu: "8"
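These are vLLM runtime flags: `--tensor-parallel-size=8` shards the 70B model across eight GPUs, so it must match the `nvidia.com/gpu` request and limit. A small sanity-check sketch, assuming PyYAML is available and a local copy of this patch file named `patch.yaml`:

import sys
import yaml  # assumption: PyYAML is installed

def check_tp_matches_gpus(path):
    # Fail if --tensor-parallel-size disagrees with the GPU request.
    with open(path) as f:
        model = yaml.safe_load(f)["spec"]["predictor"]["model"]

    tp = next(
        int(arg.split("=", 1)[1])
        for arg in model["args"]
        if arg.startswith("--tensor-parallel-size=")
    )
    gpus = int(model["resources"]["requests"]["nvidia.com/gpu"])

    if tp != gpus:
        sys.exit(f"mismatch: tensor-parallel-size={tp}, gpu request={gpus}")
    print(f"OK: tensor parallelism {tp} across {gpus} GPUs")

check_tp_matches_gpus("patch.yaml")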
@@ -0,0 +1,14 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namePrefix: llama-3.1-8b-instruct-
+
+resources:
+- ../../base
+
+patches:
+- path: patch.yaml
+  target:
+    kind: InferenceService
+  options:
+    allowNameChange: true
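Each model directory here is a kustomize overlay on a shared `../../base` InferenceService: `namePrefix` renames the resource (which is why the patch target sets `allowNameChange: true`), and `patch.yaml` overrides the model-specific fields. A quick way to inspect the rendered result, sketched with `kubectl kustomize` called from Python; the overlay path is hypothetical:

import subprocess
import yaml  # assumption: PyYAML is installed

# Hypothetical path; point this at the llama-3.1-8b-instruct overlay directory.
overlay = "kustomize/llama-3.1-8b-instruct"

rendered = subprocess.run(
    ["kubectl", "kustomize", overlay],
    check=True, capture_output=True, text=True,
).stdout

isvc = yaml.safe_load(rendered)
# The base resource is named "isvc"; the prefix should now be applied to it.
assert isvc["metadata"]["name"] == "llama-3.1-8b-instruct-isvc"
print(isvc["spec"]["predictor"]["model"]["storageUri"])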
@@ -4,11 +4,14 @@ metadata:
   name: isvc
 spec:
   predictor:
+    minReplicas: 1
+    maxReplicas: 1
     model:
-      storageUri: s3://psap-hf-models/Meta-Llama-3.1-8B/Meta-Llama-3.1-8B/
+      storageUri: s3://psap-1474-blog-models/meta-llama/Meta-Llama-3.1-8B-Instruct/
       args:
         - --dtype=bfloat16
-        - --tensor-parallel-size=1
+        - --tensor-parallel-size=1
+        - --max-model-len=4096
       resources:
         requests:
           nvidia.com/gpu: "1"
24 changes: 9 additions & 15 deletions projects/kserve/testing/config.yaml
@@ -96,6 +96,10 @@ ci_presets:
 
   raw:
     kserve.raw_deployment.enabled: true
+    tests.e2e.llm_load_test.args.interface: "http"
+
+  serverless:
+    tests.e2e.llm_load_test.args.interface: "https"
 
   quick_raw:
     extends: [raw, no_gpu]
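The interface setting moves out of the vllm preset (next hunk) and into the `raw`/`serverless` presets, so each deployment mode carries its own llm-load-test interface. A minimal sketch of how dotted-key presets and `extends` plausibly compose, assuming simple last-writer-wins merging (the real resolver may differ); the preset bodies are trimmed to the relevant keys:

ci_presets = {
    "raw": {"kserve.raw_deployment.enabled": True,
            "tests.e2e.llm_load_test.args.interface": "http"},
    "serverless": {"tests.e2e.llm_load_test.args.interface": "https"},
    "rhoai_inference_performance_blog_setup": {
        "extends": ["serverless"],
        "tests.e2e.llm_load_test.args.duration": 60,
    },
}

def resolve(name):
    # Flatten a preset and everything it extends into one dotted-key dict.
    preset = dict(ci_presets[name])
    merged = {}
    for parent in preset.pop("extends", []):
        merged.update(resolve(parent))
    merged.update(preset)  # a preset's own keys win over inherited ones
    return merged

print(resolve("rhoai_inference_performance_blog_setup"))
# {'tests.e2e.llm_load_test.args.interface': 'https',
#  'tests.e2e.llm_load_test.args.duration': 60}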
@@ -139,7 +143,6 @@ ci_presets:
 #   kserve.model.serving_runtime.update_image: False
     tests.e2e.llm_load_test.args.plugin: "openai_plugin"
     tests.e2e.llm_load_test.args.streaming: true
-    tests.e2e.llm_load_test.args.interface: "https"
     tests.e2e.llm_load_test.args.endpoint: "/v1/completions"
 #   tests.e2e.llm_load_test.args.model_name: "/mnt/models/"
 #   tests.e2e.llm_load_test.args.host: "http://localhost:8033"
@@ -292,30 +295,21 @@ ci_presets:
 
   ##### PSAP 1474 Setup
   rhoai_inference_performance_blog_setup:
-    extends: [vllm, e2e_perf, metal]
+    extends: [serverless, vllm, e2e_perf, metal]
     tests.e2e.models:
     - name: phi-2
       testing:
         size: small # determines the dataset size
         max_concurrency: 1 # caps the concurrency for this model
     - name: granite-8b-code-instruct
       testing:
         size: small
         max_concurrency: 1
-    - name: llama-3-dot-1-8b
-      testing:
-        size: small
-        max_concurrency: 1
 
     tests.e2e.matbenchmark.enabled: true
     tests.e2e.llm_load_test.enabled: true
-    tests.e2e.llm_load_test.args.duration: 180
+    tests.e2e.llm_load_test.args.duration: 60
     tests.e2e.llm_load_test.args.concurrency: [1, 2, 4, 8, 16, 32, 64, 128, 256] # also supports a list
     tests.visualize: true
     matbench.lts.opensearch.export.enabled: false
-    export_artifacts.enabled: false
-    export_artifacts.bucket: rhods-baremetal-results
-    export_artifacts.path_prefix: local-ci/rhods/kserve
+    export_artifacts.enabled: true
+    export_artifacts.bucket: psap-1474-results
+    export_artifacts.path_prefix: dry-run
   ##### PSAP 1474 End of Setup
 
 # --
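With `matbenchmark.enabled: true`, the concurrency list fans out into one benchmark run per level, each now lasting 60 seconds instead of 180. Purely as an illustration of the loop shape; `run_llm_load_test` is a hypothetical stand-in for the real harness:

DURATION_S = 60
CONCURRENCY_LEVELS = [1, 2, 4, 8, 16, 32, 64, 128, 256]

def run_llm_load_test(concurrency, duration_s, interface="https",
                      plugin="openai_plugin", endpoint="/v1/completions"):
    # Hypothetical stand-in: one llm-load-test invocation per concurrency level.
    print(f"load test: concurrency={concurrency}, {duration_s}s, "
          f"{interface} {endpoint} via {plugin}")

for concurrency in CONCURRENCY_LEVELS:
    run_llm_load_test(concurrency, DURATION_S)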
