Skip to content

Latest commit

 

History

History
189 lines (157 loc) · 6.13 KB

File metadata and controls

189 lines (157 loc) · 6.13 KB

Raw PVC Feature

# OPTIONAL: set your namespace for testing
(cd pocs/persistence-and-caching/raw-pvc && kustomize edit set namespace knim-raw-pvc)
# apply the required resources
$ k apply -k pocs/persistence-and-caching/raw-pvc

namespace/knim-raw-pvc created
secret/ngc-secret created
secret/nvidia-nim-secrets created
persistentvolumeclaim/nim-pvc created
servingruntime.serving.kserve.io/nvidia-nim-llama3-8b-instruct-1.0.0 created
inferenceservice.serving.kserve.io/llama3-8b-instruct-1xgpu created
# wait for the service to be ready; this might take a couple of minutes
$ k wait inferenceservices -n knim-raw-pvc llama3-8b-instruct-1xgpu --for condition=Ready --timeout 300s

inferenceservice.serving.kserve.io/llama3-8b-instruct-1xgpu condition met
# list all components
$ k get all -n knim-raw-pvc -o name

pod/llama3-8b-instruct-1xgpu-predictor-5bd786f77d-bljj2
service/llama3-8b-instruct-1xgpu-predictor
deployment.apps/llama3-8b-instruct-1xgpu-predictor
replicaset.apps/llama3-8b-instruct-1xgpu-predictor-5bd786f77d
horizontalpodautoscaler.autoscaling/llama3-8b-instruct-1xgpu-predictor
# grab the name of the pod created
$ k get pods -n knim-raw-pvc

NAME                                                  READY   STATUS    RESTARTS   AGE
llama3-8b-instruct-1xgpu-predictor-5bd786f77d-bljj2   2/2     Running   0          9m17s
# check the download time; don't forget to use the correct pod name from your environment
$ pod=llama3-8b-instruct-1xgpu-predictor-5bd786f77d-bljj2 && \
k logs -n knim-raw-pvc $pod kserve-container | grep 'Model workspace is now ready'

INFO 07-24 20:42:13.878 ngc_injector.py:172] Model workspace is now ready. It took 70.084 seconds

Note: Currently, KServe does not create an OpenShift Route in Raw deployment mode. This should be fixed in time for the integration. For this POC, we create the Route manually.

# create an openshift route
$ oc expose service -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor

route/llama3-8b-instruct-1xgpu-predictor exposed
# test using NIM API to get a list of the existing models and their attributes
$ runtimeurl=$(k get route -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.spec.host') && \
curl -sk $runtimeurl/v1/models | jq

{
  "object": "list",
  "data": [
    {
      "id": "meta/llama3-8b-instruct",
      "object": "model",
      ...
      "permission": [
        {
         ...
        }
      ]
    }
  ]
}
# test using NIM API to interact with the underlying model
$ runtimeurl=$(k get route -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.spec.host') && \
curl -sk $runtimeurl/v1/chat/completions -H "Content-Type: application/json" -d \
'{
  "model": "meta/llama3-8b-instruct",
  "messages": [{"role":"user","content":"What is Red Hat OpenShift AI?"}],
  "temperature": 0.5,
  "top_p": 1,
  "max_tokens": 1024,
  "stream": false
}' | jq

{
  ...
  "model": "meta/llama3-8b-instruct",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Red Hat OpenShift AI is a cloud-native, containerized platform designed to simplify the deployment, management, and scaling of artificial intelligence (AI) and machine learning (ML) workloads..."
      },
      ...
    }
  ],
  "usage": {
    ...
  }
}
# verify only one replica is scheduled currently (this might take a minute or two to get right)
$ k get hpa -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.status' | yq '. |= pick(["currentReplicas", "desiredReplicas"])'

currentReplicas: 1
desiredReplicas: 1

Before continuing, note that for this demo we configured the HPA with scaleMetric=cpu and scaleTarget=1, so that it scales up as soon as CPU utilization exceeds 1%.
See kustomization.yaml for the exact settings.

# trigger a scale-up of the pods by sending multiple interaction requests simultaneously
$ runtimeurl=$(k get route -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.spec.host') && \
for i in {1..80}; do curl -sk $runtimeurl/v1/chat/completions -H "Content-Type: application/json" -d \
'{
  "model": "meta/llama3-8b-instruct",
  "messages": [{"role":"user","content":"What is Red Hat OpenShift AI?"}],
  "temperature": 0.5,
  "top_p": 1,
  "max_tokens": 1024,
  "stream": false
}' > /dev/null 2>&1 & done
# wait a minute or so and verify that additional replicas are requested by the HPA
$ k get hpa -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.status' | yq '. |= pick(["currentReplicas", "desiredReplicas"])'

currentReplicas: 1
desiredReplicas: 3
# grab the name of the new pod created based on the age
$ k get pods -n knim-raw-pvc

NAME                                                  READY   STATUS    RESTARTS   AGE
llama3-8b-instruct-1xgpu-predictor-5bd786f77d-2cr4m   0/2     Running   0          9s
llama3-8b-instruct-1xgpu-predictor-5bd786f77d-bljj2   2/2     Running   0          11m
llama3-8b-instruct-1xgpu-predictor-5bd786f77d-mbqqc   2/2     Running   0          39s
# you can verify the hpa once all the pods are alive and ready
$ k get hpa -n knim-raw-pvc llama3-8b-instruct-1xgpu-predictor -o yaml | yq '.status' | yq '. |= pick(["currentReplicas", "desiredReplicas"])'

currentReplicas: 3
desiredReplicas: 3
# check the download time in one of the NEW pods; don't forget to use the correct pod name from your environment
$ pod=llama3-8b-instruct-1xgpu-predictor-5bd786f77d-2cr4m && \
k logs -n knim-raw-pvc $pod kserve-container | grep 'Model workspace is now ready'

INFO 07-24 20:53:02.445 ngc_injector.py:172] Model workspace is now ready. It took 2.383 seconds

Note the time it took to prepare the model in the new pod: only 2.383 seconds, compared with 70.084 seconds for the first pod.

# cleanup - this might take a couple of minutes
$ k delete -k pocs/persistence-and-caching/raw-pvc

namespace "knim-raw-pvc" deleted
secret "ngc-secret" deleted
secret "nvidia-nim-secrets" deleted
persistentvolumeclaim "nim-pvc" deleted
servingruntime.serving.kserve.io "nvidia-nim-llama3-8b-instruct-1.0.0" deleted
inferenceservice.serving.kserve.io "llama3-8b-instruct-1xgpu" deleted