diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml
index 39ac1388db..8de7eba9cb 100644
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -50,7 +50,6 @@ jobs:
# Image Build
####################################################################################################
build-images:
- if: ${{ !(fromJSON(inputs.test_helmchart)) }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
diff --git a/.github/workflows/_helm-e2e.yml b/.github/workflows/_helm-e2e.yml
index b162baf9a7..48be97012a 100644
--- a/.github/workflows/_helm-e2e.yml
+++ b/.github/workflows/_helm-e2e.yml
@@ -29,6 +29,10 @@ on:
default: "latest"
required: false
type: string
+ version:
+ default: "0-latest"
+ required: false
+ type: string
jobs:
get-test-case:
@@ -154,6 +158,13 @@ jobs:
exit 0
fi
+ for img in `helm template -n $NAMESPACE $RELEASE_NAME oci://ghcr.io/opea-project/charts/${CHART_NAME} -f ${{ inputs.example }}/kubernetes/helm/${value_file} --version ${{ inputs.version }} | grep 'image:' | grep 'opea/' | awk '{print $2}' | xargs`;
+ do
+          # increase helm install wait for the vllm-gaudi case
+ if [[ $img == *"vllm-gaudi"* ]]; then
+ ROLLOUT_TIMEOUT_SECONDS=900s
+ fi
+ done
if ! helm install \
--create-namespace \
--namespace $NAMESPACE \
@@ -163,9 +174,11 @@ jobs:
--set global.modelUseHostPath=/home/sdp/.cache/huggingface/hub \
--set GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
--set GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
+ --set web-retriever.GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
+ --set web-retriever.GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
-f ${{ inputs.example }}/kubernetes/helm/${value_file} \
- --version 0-latest \
- --wait; then
+ --version ${{ inputs.version }} \
+ --wait --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then
echo "Failed to install chart ${{ inputs.example }}"
echo "skip_validate=true" >> $GITHUB_ENV
.github/workflows/scripts/k8s-utils.sh dump_pods_status $NAMESPACE
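Note: the new image-scan loop above only raises the helm `--wait` timeout when a vllm-gaudi image is rendered by the chart. A minimal standalone sketch of that pattern follows; the image list and the 600s default are illustrative (the workflow's real default for `ROLLOUT_TIMEOUT_SECONDS` is defined elsewhere), and in the workflow the list comes from `helm template ... | grep 'image:' | grep 'opea/'`.

```bash
#!/usr/bin/env bash
# Sketch of the timeout-bump logic; values are illustrative, not taken from the workflow.
ROLLOUT_TIMEOUT_SECONDS="600s"   # assumed default defined earlier in the workflow

# Stand-in for the images extracted from `helm template` output.
images="opea/chatqna:latest opea/vllm-gaudi:latest"

for img in $images; do
  # vLLM on Gaudi needs a longer warmup, so widen the helm --wait window.
  if [[ $img == *"vllm-gaudi"* ]]; then
    ROLLOUT_TIMEOUT_SECONDS=900s
  fi
done

echo "helm install ... --wait --timeout $ROLLOUT_TIMEOUT_SECONDS"
```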
diff --git a/.github/workflows/scripts/k8s-utils.sh b/.github/workflows/scripts/k8s-utils.sh
index ba58e1a152..0676a80d38 100755
--- a/.github/workflows/scripts/k8s-utils.sh
+++ b/.github/workflows/scripts/k8s-utils.sh
@@ -12,7 +12,7 @@ function dump_pod_log() {
kubectl describe pod $pod_name -n $namespace
echo "-----------------------------------"
echo "#kubectl logs $pod_name -n $namespace"
- kubectl logs $pod_name -n $namespace
+ kubectl logs $pod_name -n $namespace --all-containers --prefix=true
echo "-----------------------------------"
}
@@ -44,8 +44,13 @@ function dump_pods_status() {
function dump_all_pod_logs() {
namespace=$1
+ echo "------SUMMARY of POD STATUS in NS $namespace------"
+ kubectl get pods -n $namespace -o wide
+ echo "------SUMMARY of SVC STATUS in NS $namespace------"
+ kubectl get services -n $namespace -o wide
+  echo "------SUMMARY of ENDPOINT STATUS in NS $namespace------"
+ kubectl get endpoints -n $namespace -o wide
echo "-----DUMP POD STATUS AND LOG in NS $namespace------"
-
pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}')
for pod_name in $pods
do
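As a usage note, the dump helpers are invoked the same way the workflow already calls `dump_pods_status`. A hedged example of collecting the new status summaries plus per-pod logs after a failed install, assuming the script dispatches the function named in its first argument (as the existing call suggests); the namespace value is illustrative.

```bash
# Illustrative post-failure debugging calls from the repo root.
NAMESPACE=example-ns
.github/workflows/scripts/k8s-utils.sh dump_pods_status "$NAMESPACE"
.github/workflows/scripts/k8s-utils.sh dump_all_pod_logs "$NAMESPACE"
```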
diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/README.md b/AgentQnA/docker_compose/amd/gpu/rocm/README.md
new file mode 100644
index 0000000000..b0d76d0d31
--- /dev/null
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/README.md
@@ -0,0 +1,101 @@
+# Single node on-prem deployment with Docker Compose on AMD GPU
+
+This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on an AMD GPU with ROCm. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).
+
+## Deployment with Docker
+
+1. First, clone this repo.
+ ```
+ export WORKDIR=
+ cd $WORKDIR
+ git clone https://github.com/opea-project/GenAIExamples.git
+ ```
+2. Set up the environment for this example
+
+ ```
+ # Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
+ export host_ip=$(hostname -I | awk '{print $1}')
+ # if you are in a proxy environment, also set the proxy-related environment variables
+ export http_proxy="Your_HTTP_Proxy"
+ export https_proxy="Your_HTTPs_Proxy"
+ # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+ export no_proxy="Your_No_Proxy"
+
+ export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
+   # Set OPENAI_API_KEY if you want to use OpenAI models
+ export OPENAI_API_KEY=
+ # Set AMD GPU settings
+ export AGENTQNA_CARD_ID="card1"
+ export AGENTQNA_RENDER_ID="renderD136"
+ ```
+
+3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
+
+ First, launch the mega-service.
+
+ ```
+ cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
+ bash launch_retrieval_tool.sh
+ ```
+
+ Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
+
+ ```
+ bash run_ingest_data.sh
+ ```
+
+4. Launch Tool service
+ In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
+ ```
+ docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
+ ```
+5. Launch `Agent` service
+
+ ```
+ cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/amd/gpu/rocm
+ bash launch_agent_service_tgi_rocm.sh
+ ```
+
+6. [Optional] Build the `Agent` Docker image if pulling the image failed.
+
+ ```
+ git clone https://github.com/opea-project/GenAIComps.git
+ cd GenAIComps
+ docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
+ ```
+
+## Validate services
+
+First, check the logs of the agent Docker containers:
+
+```
+# worker agent
+docker logs rag-agent-endpoint
+```
+
+```
+# supervisor agent
+docker logs react-agent-endpoint
+```
+
+You should see something like "HTTP server setup successful" if the Docker containers started successfully.
+
+Second, validate the worker agent:
+
+```
+curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
+ "query": "Most recent album by Taylor Swift"
+ }'
+```
+
+Third, validate the supervisor agent:
+
+```
+curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
+ "query": "Most recent album by Taylor Swift"
+ }'
+```
+
+## How to register your own tools with the agent
+
+You can take a look at the tool YAML and Python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
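For the "Validate services" step in the new README, a small hedged helper is sketched below. It only greps the two agent containers named above for the readiness string the README mentions; nothing here is specific to the agent implementation.

```bash
#!/usr/bin/env bash
# Wait until both agent containers report a successful HTTP server setup.
for name in rag-agent-endpoint react-agent-endpoint; do
  until docker logs "$name" 2>&1 | grep -q "HTTP server setup successful"; do
    echo "Waiting for $name to become ready..."
    sleep 5
  done
  echo "$name is ready."
done
```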
diff --git a/AgentQnA/kubernetes/helm/gaudi-values.yaml b/AgentQnA/kubernetes/helm/gaudi-values.yaml
index 91ef5d1026..2d171ea22a 100644
--- a/AgentQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
-tgi:
+vllm:
enabled: true
- accelDevice: "gaudi"
image:
- repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
- resources:
- limits:
- habana.ai/gaudi: 4
- MAX_INPUT_LENGTH: "4096"
- MAX_TOTAL_TOKENS: "8192"
- CUDA_GRAPHS: ""
- OMPI_MCA_btl_vader_single_copy_mechanism: "none"
- PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
- ENABLE_HPU_GRAPH: "true"
- LIMIT_HPU_GRAPH: "true"
- USE_FLASH_ATTENTION: "true"
- FLASH_ATTENTION_RECOMPUTE: "true"
- extraCmdArgs: ["--sharded","true","--num-shard","4"]
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- failureThreshold: 120
+ repository: opea/vllm-gaudi
+supervisor:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
+ragagent:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
+sqlagent:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/AgentQnA/tests/step1_build_images.sh b/AgentQnA/tests/step1_build_images.sh
index e00cf75106..4cb8a2e4d1 100644
--- a/AgentQnA/tests/step1_build_images.sh
+++ b/AgentQnA/tests/step1_build_images.sh
@@ -38,19 +38,17 @@ function build_vllm_docker_image() {
echo "Building the vllm docker image"
cd $WORKPATH
echo $WORKPATH
- if [ ! -d "./vllm" ]; then
- echo "clone vllm repo...."
- git clone https://github.com/vllm-project/vllm.git
+ if [ ! -d "./vllm-fork" ]; then
+ git clone https://github.com/HabanaAI/vllm-fork.git
fi
- cd ./vllm
- echo "Checking out latest stable release of vllm"
- git checkout v0.6.6
- docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+ cd ./vllm-fork
+ git checkout v0.6.4.post2+Gaudi-1.19.0
+ docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
- echo "opea/vllm-gaudi:comps failed"
+ echo "opea/vllm-gaudi:ci failed"
exit 1
else
- echo "opea/vllm-gaudi:comps successful"
+ echo "opea/vllm-gaudi:ci successful"
fi
}
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
index c99e212ff6..824f7aa855 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
@@ -8,15 +8,17 @@ WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
-export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
+export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
-export HF_CACHE_DIR=$WORKDIR/hf_cache
+export HF_CACHE_DIR=/data2/huggingface
if [ ! -d "$HF_CACHE_DIR" ]; then
+ HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
fi
+echo "HF_CACHE_DIR=$HF_CACHE_DIR"
ls $HF_CACHE_DIR
vllm_port=8086
@@ -35,7 +37,7 @@ function start_vllm_service_70B() {
echo "start vllm gaudi service"
echo "**************model is $model**************"
- vllm_image=opea/vllm-gaudi:comps
+ vllm_image=opea/vllm-gaudi:ci
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
sleep 5s
echo "Waiting vllm gaudi ready"
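After the `docker run` above, the test waits for the vLLM server to become ready. A minimal hedged sketch of such a wait loop against vLLM's OpenAI-compatible API is shown below; the port matches `vllm_port` in the script, and `/v1/models` is the standard vLLM model-listing endpoint.

```bash
# Poll the vLLM OpenAI-compatible endpoint until the model list is served.
vllm_port=8086
until curl -sf "http://localhost:${vllm_port}/v1/models" > /dev/null; do
  echo "Waiting for vllm-gaudi-server to become ready..."
  sleep 30
done
echo "vLLM is serving."
```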
diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh
index cf224b6aa1..880102f0f8 100644
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -4,9 +4,6 @@
set -xe
-echo "All running containers"
-docker ps
-
WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
diff --git a/AudioQnA/kubernetes/helm/gaudi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-values.yaml
index faaad653ef..9b06ff4296 100644
--- a/AudioQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -5,7 +5,7 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
failureThreshold: 120
whisper:
+ image:
+ repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
+ image:
+ repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index 01a00a8193..764afba4d4 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -280,7 +280,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
1. TEI Embedding Service
```bash
- curl ${host_ip}:6006/embed \
+ curl http://${host_ip}:6006/embed \
-X POST \
-d '{"inputs":"What is Deep Learning?"}' \
-H 'Content-Type: application/json'
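For the TEI embedding check above, a successful call returns a JSON array containing one embedding vector per input. A hedged follow-up that prints just the embedding dimension, assuming `jq` is installed on the host:

```bash
# Call the TEI embedding service and print the embedding dimension.
curl -s http://${host_ip}:6006/embed \
  -X POST \
  -d '{"inputs":"What is Deep Learning?"}' \
  -H 'Content-Type: application/json' | jq '.[0] | length'
```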
diff --git a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 0000000000..f552e1d5bc
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,112 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Override CPU resource request and probe timing values in specific subcharts
+#
+# RESOURCES
+#
+# Resource request matching actual resource usage (with enough slack)
+# is important when service is scaled up, so that right amount of pods
+# get scheduled to right nodes.
+#
+# Because resource usage depends on the used devices, model, data type
+# and SW versions, and this top-level chart has overrides for them,
+# resource requests need to be specified here too.
+#
+# To test service without resource request, use "resources: {}".
+#
+# PROBES
+#
+# Inferencing pods startup / warmup takes *much* longer on CPUs than
+# with acceleration devices, and their responses are also slower,
+# especially when node is running several instances of these services.
+#
+# Kubernetes restarting pod before its startup finishes, or not
+# sending it queries because it's not in ready state due to slow
+# readiness responses, does really NOT help in getting faster responses.
+#
+# => probe timings need to be increased when running on CPU.
+
+vllm:
+ enabled: false
+tgi:
+ enabled: true
+ # TODO: add Helm value also for TGI data type option:
+ # https://github.com/opea-project/GenAIExamples/issues/330
+ LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+
+ # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
+ #resources:
+ # limits:
+ # cpu: 8
+ # memory: 70Gi
+ # requests:
+ # cpu: 6
+ # memory: 65Gi
+
+ livenessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ failureThreshold: 24
+ timeoutSeconds: 4
+ readinessProbe:
+ initialDelaySeconds: 16
+ periodSeconds: 8
+ timeoutSeconds: 4
+ startupProbe:
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ failureThreshold: 180
+ timeoutSeconds: 2
+
+teirerank:
+ RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+
+ # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
+ resources:
+ limits:
+ cpu: 4
+ memory: 30Gi
+ requests:
+ cpu: 2
+ memory: 25Gi
+
+ livenessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ failureThreshold: 24
+ timeoutSeconds: 4
+ readinessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ timeoutSeconds: 4
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 120
+
+tei:
+ EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+
+ # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
+ resources:
+ limits:
+ cpu: 4
+ memory: 4Gi
+ requests:
+ cpu: 2
+ memory: 3Gi
+
+ livenessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 24
+ timeoutSeconds: 2
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 2
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 120
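A hypothetical use of the new TGI-on-CPU override file follows. As above, the chart reference follows the `oci://ghcr.io/opea-project/charts/<chart>` pattern from the e2e workflow, and the release name, namespace, and token setting are assumptions.

```bash
# Illustrative install of ChatQnA with TGI on CPU instead of vLLM.
helm install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --create-namespace --namespace chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN="${HUGGINGFACEHUB_API_TOKEN}" \
  -f ChatQnA/kubernetes/helm/cpu-tgi-values.yaml \
  --version 0-latest
```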
diff --git a/ChatQnA/kubernetes/helm/cpu-values.yaml b/ChatQnA/kubernetes/helm/cpu-values.yaml
index b4c5ee5ddb..86b68a921f 100644
--- a/ChatQnA/kubernetes/helm/cpu-values.yaml
+++ b/ChatQnA/kubernetes/helm/cpu-values.yaml
@@ -1,109 +1,5 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Override CPU resource request and probe timing values in specific subcharts
-#
-# RESOURCES
-#
-# Resource request matching actual resource usage (with enough slack)
-# is important when service is scaled up, so that right amount of pods
-# get scheduled to right nodes.
-#
-# Because resource usage depends on the used devices, model, data type
-# and SW versions, and this top-level chart has overrides for them,
-# resource requests need to be specified here too.
-#
-# To test service without resource request, use "resources: {}".
-#
-# PROBES
-#
-# Inferencing pods startup / warmup takes *much* longer on CPUs than
-# with acceleration devices, and their responses are also slower,
-# especially when node is running several instances of these services.
-#
-# Kubernetes restarting pod before its startup finishes, or not
-# sending it queries because it's not in ready state due to slow
-# readiness responses, does really NOT help in getting faster responses.
-#
-# => probe timings need to be increased when running on CPU.
-
-tgi:
- # TODO: add Helm value also for TGI data type option:
- # https://github.com/opea-project/GenAIExamples/issues/330
- LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-
- # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
- resources:
- limits:
- cpu: 8
- memory: 70Gi
- requests:
- cpu: 6
- memory: 65Gi
-
- livenessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- failureThreshold: 24
- timeoutSeconds: 4
- readinessProbe:
- initialDelaySeconds: 16
- periodSeconds: 8
- timeoutSeconds: 4
- startupProbe:
- initialDelaySeconds: 10
- periodSeconds: 5
- failureThreshold: 180
- timeoutSeconds: 2
-
-teirerank:
- RERANK_MODEL_ID: "BAAI/bge-reranker-base"
-
- # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
- resources:
- limits:
- cpu: 4
- memory: 30Gi
- requests:
- cpu: 2
- memory: 25Gi
-
- livenessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- failureThreshold: 24
- timeoutSeconds: 4
- readinessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- timeoutSeconds: 4
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 120
-
-tei:
- EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
-
- # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
- resources:
- limits:
- cpu: 4
- memory: 4Gi
- requests:
- cpu: 2
- memory: 3Gi
-
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 24
- timeoutSeconds: 2
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 2
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 120
+image:
+ repository: opea/chatqna
diff --git a/ChatQnA/kubernetes/helm/gaudi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
similarity index 97%
rename from ChatQnA/kubernetes/helm/gaudi-values.yaml
rename to ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
index 47df99fc44..d4da00c976 100644
--- a/ChatQnA/kubernetes/helm/gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
+vllm:
+ enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
+ enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
index 6c1a44ebff..76eafae029 100644
--- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
@@ -6,9 +6,9 @@
tgi:
enabled: false
-
vllm:
enabled: true
+ shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- failureThreshold: 120
+ failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]
-
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
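The new `shmSize: 1Gi` setting typically surfaces as a memory-backed emptyDir mounted at `/dev/shm` in the vLLM pod (an assumption about how the chart wires it). A quick hedged check, with an illustrative namespace and label selector:

```bash
# Verify the shared-memory size inside the running vLLM pod.
POD=$(kubectl get pods -n chatqna -l app.kubernetes.io/name=vllm \
        -o jsonpath='{.items[0].metadata.name}')
kubectl exec -n chatqna "$POD" -- df -h /dev/shm
```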
diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
index aad83623d5..8e8a491a0a 100644
--- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
@@ -44,17 +44,18 @@ teirerank:
readinessProbe:
timeoutSeconds: 1
-tgi:
+tgi-guardrails:
+ enabled: true
accelDevice: "gaudi"
+ LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- # higher limits are needed with extra input tokens added by rerank
- MAX_INPUT_LENGTH: "2048"
- MAX_TOTAL_TOKENS: "4096"
+ MAX_INPUT_LENGTH: "1024"
+ MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
@@ -75,34 +76,37 @@ tgi:
timeoutSeconds: 1
failureThreshold: 120
-tgi-guardrails:
+tgi:
+ enabled: false
+vllm:
enabled: true
+ shmSize: 1Gi
accelDevice: "gaudi"
- LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
- repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "1024"
- MAX_TOTAL_TOKENS: "2048"
- CUDA_GRAPHS: ""
- OMPI_MCA_btl_vader_single_copy_mechanism: "none"
- ENABLE_HPU_GRAPH: "true"
- LIMIT_HPU_GRAPH: "true"
- USE_FLASH_ATTENTION: "true"
- FLASH_ATTENTION_RECOMPUTE: "true"
- livenessProbe:
+ startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
+ failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- startupProbe:
+ livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- failureThreshold: 120
+
+ PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+ extraCmdArgs: [
+ "--tensor-parallel-size", "1",
+ "--block-size", "128",
+ "--max-num-seqs", "256",
+ "--max-seq_len-to-capture", "2048"
+ ]
diff --git a/ChatQnA/kubernetes/helm/guardrails-values.yaml b/ChatQnA/kubernetes/helm/guardrails-values.yaml
deleted file mode 100644
index d37a41060c..0000000000
--- a/ChatQnA/kubernetes/helm/guardrails-values.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-image:
- repository: opea/chatqna-guardrails
-
-# guardrails related config
-guardrails-usvc:
- enabled: true
- # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
- SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
-tgi-guardrails:
- enabled: true
- LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
diff --git a/ChatQnA/kubernetes/helm/nv-values.yaml b/ChatQnA/kubernetes/helm/nv-values.yaml
deleted file mode 100644
index 67c4e3ac18..0000000000
--- a/ChatQnA/kubernetes/helm/nv-values.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# To override values in subchart tgi
-tgi:
- accelDevice: "nvidia"
- image:
- repository: ghcr.io/huggingface/text-generation-inference
- tag: "2.2.0"
- resources:
- limits:
- nvidia.com/gpu: 1
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- failureThreshold: 120
diff --git a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh b/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh
deleted file mode 100755
index 274cc5209c..0000000000
--- a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [[ $status == false ]]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_guardrails"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- kubectl create namespace $ns
- # install guardrails
- kubectl apply -f chatqna-guardrails.yaml -n $ns
- # Sleep enough time for chatqna_guardrails to be ready
- sleep 60
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-guardrails
- validate_chatqna $NAMESPACE chatqna-guardrails
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh b/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh
deleted file mode 100755
index 63d494c9f8..0000000000
--- a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=10
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_guardrails"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
- kubectl create namespace $ns
- # install guardrail
- kubectl apply -f chatqna-guardrails.yaml -n $ns
- # Sleep enough time for chatqna_guardrails to be ready
- sleep 60
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-guardrails
- validate_chatqna $NAMESPACE chatqna-guardrails
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index d1764401ff..0000000000
--- a/ChatQnA/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
- echo "namespace is $NAMESPACE"
- kubectl apply -f chatqna.yaml -n $NAMESPACE
- # Sleep enough time for retreiver-usvc to be ready
- sleep 60
-}
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_chatqna
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna
- validate_chatqna $NAMESPACE chatqna
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_on_xeon.sh b/ChatQnA/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 4c93a8958e..0000000000
--- a/ChatQnA/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
- echo "namespace is $NAMESPACE"
- kubectl apply -f chatqna.yaml -n $NAMESPACE
- # Sleep enough time for retreiver-usvc to be ready
- sleep 60
-}
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=10
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_chatqna
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna
- validate_chatqna $NAMESPACE chatqna
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh b/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh
deleted file mode 100755
index c1ab58460e..0000000000
--- a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
-
- # make sure microservice vllm-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
- curl http://$endpoint_url/v1/chat/completions -X POST \
- -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice vllm failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_vllm"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- kubectl create namespace $ns
- # install guardrail
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- kubectl apply -f chatqna-vllm.yaml -n $ns
- # Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
- sleep 280
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-vllm
- validate_chatqna $NAMESPACE chatqna-vllm
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeGen/kubernetes/helm/gaudi-values.yaml b/CodeGen/kubernetes/helm/gaudi-values.yaml
index e26bb4a5ed..25ac2c3959 100644
--- a/CodeGen/kubernetes/helm/gaudi-values.yaml
+++ b/CodeGen/kubernetes/helm/gaudi-values.yaml
@@ -6,13 +6,18 @@ tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/CodeGen/tests/test_manifest_on_gaudi.sh b/CodeGen/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index a54e2d76df..0000000000
--- a/CodeGen/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
- # executed under path manifest/codegen/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
- -d '{"messages": "def print_hello_world():"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeGen)
- pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
- init_codegen
- popd
- ;;
- install_CodeGen)
- pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_codegen
- popd
- ;;
- validate_CodeGen)
- NAMESPACE=$2
- SERVICE_NAME=codegen
- validate_codegen
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeGen/tests/test_manifest_on_xeon.sh b/CodeGen/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index b0975f14a6..0000000000
--- a/CodeGen/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
- # executed under path manifest/codegen/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
- -d '{"messages": "def print_hello_world():"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeGen)
- pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
- init_codegen
- popd
- ;;
- install_CodeGen)
- pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_codegen
- popd
- ;;
- validate_CodeGen)
- NAMESPACE=$2
- SERVICE_NAME=codegen
- validate_codegen
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeTrans/kubernetes/helm/gaudi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-values.yaml
index e5367383ae..89ed259285 100644
--- a/CodeTrans/kubernetes/helm/gaudi-values.yaml
+++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml
@@ -5,13 +5,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/CodeTrans/tests/test_manifest_on_gaudi.sh b/CodeTrans/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index 7be05ae33b..0000000000
--- a/CodeTrans/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
- # executed under path manifest/codetrans/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codetrans \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeTrans)
- pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
- init_codetrans
- popd
- ;;
- install_CodeTrans)
- pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_codetrans
- popd
- ;;
- validate_CodeTrans)
- NAMESPACE=$2
- SERVICE_NAME=codetrans
- validate_codetrans
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeTrans/tests/test_manifest_on_xeon.sh b/CodeTrans/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 7f2b969240..0000000000
--- a/CodeTrans/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
- # executed under path manifest/codetrans/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codetrans \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeTrans)
- pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
- init_codetrans
- popd
- ;;
- install_CodeTrans)
- pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_codetrans
- popd
- ;;
- validate_CodeTrans)
- NAMESPACE=$2
- SERVICE_NAME=codetrans
- validate_codetrans
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/DocSum/kubernetes/helm/cpu-values.yaml b/DocSum/kubernetes/helm/cpu-values.yaml
index 97818ae448..6f2ab7768f 100644
--- a/DocSum/kubernetes/helm/cpu-values.yaml
+++ b/DocSum/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
- LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+ enabled: true
+vllm:
+ enabled: false
diff --git a/DocSum/kubernetes/helm/gaudi-values.yaml b/DocSum/kubernetes/helm/gaudi-values.yaml
index 5cfae25928..eda0abe8c4 100644
--- a/DocSum/kubernetes/helm/gaudi-values.yaml
+++ b/DocSum/kubernetes/helm/gaudi-values.yaml
@@ -1,16 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+vllm:
+ enabled: false
+
+llm-uservice:
+ DOCSUM_BACKEND: "TGI"
+
tgi:
+ enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "1024"
- MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
diff --git a/DocSum/tests/test_manifest_on_gaudi.sh b/DocSum/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index db731ac4a9..0000000000
--- a/DocSum/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_docsum() {
- # executed under path manifest/docsum/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_docsum {
- echo "namespace is $NAMESPACE"
- kubectl apply -f docsum.yaml -n $NAMESPACE
-}
-
-function validate_docsum() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/docsum..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/docsum \
- -H 'Content-Type: multipart/form-data' \
- -F 'type=text' \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_DocSum)
- pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
- init_docsum
- popd
- ;;
- install_DocSum)
- pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_docsum
- popd
- ;;
- validate_DocSum)
- NAMESPACE=$2
- SERVICE_NAME=docsum
- validate_docsum
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/DocSum/tests/test_manifest_on_xeon.sh b/DocSum/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 0bf613975c..0000000000
--- a/DocSum/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_docsum() {
- # executed under path manifest/docsum/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_docsum {
- echo "namespace is $NAMESPACE"
- kubectl apply -f docsum.yaml -n $NAMESPACE
-}
-
-function validate_docsum() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/docsum..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/docsum \
- -H 'Content-Type: multipart/form-data' \
- -F 'type=text' \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_DocSum)
- pushd DocSum/kubernetes/intel/cpu/xeon/manifest
- init_docsum
- popd
- ;;
- install_DocSum)
- pushd DocSum/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_docsum
- popd
- ;;
- validate_DocSum)
- NAMESPACE=$2
- SERVICE_NAME=docsum
- validate_docsum
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/FaqGen/kubernetes/helm/gaudi-values.yaml b/FaqGen/kubernetes/helm/gaudi-values.yaml
index d14729c4a3..e45cde146f 100644
--- a/FaqGen/kubernetes/helm/gaudi-values.yaml
+++ b/FaqGen/kubernetes/helm/gaudi-values.yaml
@@ -5,13 +5,25 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "4096"
- MAX_TOTAL_TOKENS: "8192"
+ MAX_INPUT_LENGTH: "1024"
+ MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: "0"
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
+ PREFILL_BATCH_BUCKET_SIZE: 1
+ BATCH_BUCKET_SIZE: 8
+ extraCmdArgs:
+ - "--max-batch-total-tokens"
+ - "65536"
+ - "--max-batch-prefill-tokens"
+ - "4096"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md
index 8e46be1c8a..bda42ee285 100644
--- a/MultimodalQnA/README.md
+++ b/MultimodalQnA/README.md
@@ -87,12 +87,12 @@ In the below, we provide a table that describes for each microservice component
Gaudi default compose.yaml
-| MicroService | Open Source Project | HW | Port | Endpoint |
-| ------------ | --------------------- | ----- | ---- | --------------------------------------------------------------------- |
-| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
-| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
-| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
-| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest_with_text |
+| MicroService | Open Source Project | HW | Port | Endpoint |
+| ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- |
+| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
+| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
+| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
+| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
index e49b264823..f49b9815f1 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
@@ -289,6 +289,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_DELETE_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
index c271a4b553..5cb482bc55 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
@@ -26,8 +26,8 @@ export MM_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
export LVM_SERVICE_HOST_IP=${HOST_IP}
export MEGA_SERVICE_HOST_IP=${HOST_IP}
export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete"
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete"
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
index 714fc72661..7e4fa6894a 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
@@ -92,7 +92,7 @@ export REDIS_INSIGHTS_PORT=8001
export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -334,15 +334,6 @@ export audio_fn="AudioSample.wav"
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn}
```
-```bash
-export DATAPREP_MMR_PORT=6007
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
-```
-
Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
```bash
@@ -398,6 +389,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_DELETE_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 681ba25ee6..31f543c755 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -26,7 +26,7 @@ services:
- redis-vector-db
- lvm-llava
ports:
- - "6007:${DATAPREP_MMR_PORT}"
+ - "${DATAPREP_MMR_PORT}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
index 593e89452c..057f90990c 100755
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -28,7 +28,7 @@ export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
index 35ed4abbc8..2379fc3d4d 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -37,7 +37,7 @@ export WHISPER_PORT=7066
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
export MAX_IMAGES=1
export WHISPER_MODEL="base"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -282,15 +282,6 @@ wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_ex
Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
-```bash
-export DATAPREP_MMR_PORT=6007
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
-```
-
```bash
curl --silent --write-out "HTTPSTATUS:%{http_code}" \
${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} \
@@ -324,6 +315,7 @@ Also, you are able to get the list of all files that you uploaded:
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_GET_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 6c646674b7..26b5610f5e 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -28,7 +28,7 @@ services:
- redis-vector-db
- lvm
ports:
- - "6007:${DATAPREP_MMR_PORT}"
+ - "${DATAPREP_MMR_PORT}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -42,16 +42,21 @@ services:
MULTIMODAL_DATAPREP: true
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MULTIMODALREDIS"
restart: unless-stopped
- embedding-multimodal-bridgetower:
- image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
- container_name: embedding-multimodal-bridgetower
+ embedding-multimodal-bridgetower-gaudi:
+ image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest}
+ container_name: embedding-multimodal-bridgetower-gaudi
ports:
- ${EMM_BRIDGETOWER_PORT}:${EMM_BRIDGETOWER_PORT}
+ ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
PORT: ${EMM_BRIDGETOWER_PORT}
+ HABANA_VISIBLE_DEVICES: all
+ runtime: habana
+ cap_add:
+ - SYS_NICE
healthcheck:
test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMM_BRIDGETOWER_PORT}/v1/health_check"]
interval: 10s
@@ -64,7 +69,7 @@ services:
image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
container_name: embedding
depends_on:
- embedding-multimodal-bridgetower:
+ embedding-multimodal-bridgetower-gaudi:
condition: service_healthy
ports:
- ${MM_EMBEDDING_PORT_MICROSERVICE}:${MM_EMBEDDING_PORT_MICROSERVICE}
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index a92483f9a0..cc35d58d08 100755
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -28,7 +28,7 @@ export WHISPER_PORT=7066
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
export MAX_IMAGES=1
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml
index e90d0cc686..1fc599c3e5 100644
--- a/MultimodalQnA/docker_image_build/build.yaml
+++ b/MultimodalQnA/docker_image_build/build.yaml
@@ -23,6 +23,12 @@ services:
dockerfile: comps/third_parties/bridgetower/src/Dockerfile
extends: multimodalqna
image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
+ embedding-multimodal-bridgetower-gaudi:
+ build:
+ context: GenAIComps
+ dockerfile: comps/third_parties/bridgetower/src/Dockerfile.intel_hpu
+ extends: multimodalqna
+ image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest}
embedding:
build:
context: GenAIComps
diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh
index d50e024dc2..ccb4f1894d 100644
--- a/MultimodalQnA/tests/test_compose_on_gaudi.sh
+++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh
@@ -59,7 +59,7 @@ function build_docker_images() {
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm dataprep whisper"
+ service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever lvm dataprep whisper"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
@@ -82,7 +82,7 @@ function setup_env() {
export MAX_IMAGES=1
export WHISPER_MODEL="base"
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
- export DATAPREP_MMR_PORT=5000
+ export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -177,19 +177,19 @@ function validate_microservices() {
# Check if the microservices are running correctly.
# Bridgetower Embedding Server
- echo "Validating embedding-multimodal-bridgetower"
+ echo "Validating embedding-multimodal-bridgetower-gaudi"
validate_service \
"http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
- "embedding-multimodal-bridgetower" \
- "embedding-multimodal-bridgetower" \
+ "embedding-multimodal-bridgetower-gaudi" \
+ "embedding-multimodal-bridgetower-gaudi" \
'{"text":"This is example"}'
validate_service \
"http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
- "embedding-multimodal-bridgetower" \
- "embedding-multimodal-bridgetower" \
+ "embedding-multimodal-bridgetower-gaudi" \
+ "embedding-multimodal-bridgetower-gaudi" \
'{"text":"This is example", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
# embedding microservice
@@ -210,11 +210,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
-
# test data prep
echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh
index 7787d919ee..9ba5c68c90 100644
--- a/MultimodalQnA/tests/test_compose_on_rocm.sh
+++ b/MultimodalQnA/tests/test_compose_on_rocm.sh
@@ -67,11 +67,11 @@ function setup_env() {
export LVM_SERVICE_HOST_IP=${HOST_IP}
export MEGA_SERVICE_HOST_IP=${HOST_IP}
export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna"
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get"
- export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete"
+ export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
+ export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
+ export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
+ export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
+ export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete"
}
function start_services() {
@@ -174,11 +174,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
-
# test data prep
echo "Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh
index 10b1579478..b5d254b58c 100644
--- a/MultimodalQnA/tests/test_compose_on_xeon.sh
+++ b/MultimodalQnA/tests/test_compose_on_xeon.sh
@@ -79,7 +79,7 @@ function setup_env() {
export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
- export DATAPREP_MMR_PORT=5000
+ export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -207,11 +207,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
-
# test data prep
echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
index 6b94e54be9..7919ce5910 100644
--- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
+++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
@@ -610,7 +610,7 @@ def select_upload_type(choice, request: gr.Request):
"BACKEND_SERVICE_ENDPOINT", f"http://localhost:{MEGA_SERVICE_PORT}/v1/multimodalqna"
)
dataprep_ingest_endpoint = os.getenv(
- "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest_with_text"
+ "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest"
)
dataprep_gen_transcript_endpoint = os.getenv(
"DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_transcripts"
diff --git a/SearchQnA/kubernetes/helm/README.md b/SearchQnA/kubernetes/helm/README.md
new file mode 100644
index 0000000000..ccdf71a32f
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/README.md
@@ -0,0 +1,18 @@
+# Deploy SearchQnA on a Kubernetes cluster
+
+- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
+- For more deployment options, refer to the [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
+
+## Deploy on Xeon
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
+```
+
+## Deploy on Gaudi
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
+```
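+
+## Verify the deployment (optional)
+
+A minimal sanity check once all pods are Ready. The release name `searchqna` matches the install commands above; the service port (3008) and the `/v1/searchqna` route are assumptions based on the SearchQnA megaservice defaults, so adjust them if your chart version exposes different values.
+
+```bash
+# Check that all pods in the release are Running/Ready
+kubectl get pods
+
+# Forward the megaservice port locally (port 3008 is an assumption)
+kubectl port-forward svc/searchqna 3008:3008 &
+
+# Send a test query to the assumed /v1/searchqna endpoint
+curl http://localhost:3008/v1/searchqna \
+  -H "Content-Type: application/json" \
+  -d '{"messages": "What is the latest OPEA release?"}'
+```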
diff --git a/SearchQnA/kubernetes/helm/cpu-values.yaml b/SearchQnA/kubernetes/helm/cpu-values.yaml
new file mode 100644
index 0000000000..4de7affb83
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,7 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+llm-uservice:
+ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml
new file mode 100644
index 0000000000..ef327645de
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml
@@ -0,0 +1,50 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+ accelDevice: "gaudi"
+ image:
+ repository: ghcr.io/huggingface/tgi-gaudi
+ tag: "2.3.1"
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ MAX_INPUT_LENGTH: "2048"
+ MAX_TOTAL_TOKENS: "4096"
+ CUDA_GRAPHS: ""
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ ENABLE_HPU_GRAPH: true
+ LIMIT_HPU_GRAPH: true
+ USE_FLASH_ATTENTION: true
+ FLASH_ATTENTION_RECOMPUTE: true
+ livenessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ failureThreshold: 120
+
+tei:
+ accelDevice: "gaudi"
+ image:
+ repository: ghcr.io/huggingface/tei-gaudi
+ tag: "1.5.0"
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ MAX_WARMUP_SEQUENCE_LENGTH: 512
+ securityContext:
+ readOnlyRootFilesystem: false
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ livenessProbe:
+ timeoutSeconds: 1
+ readinessProbe:
+ timeoutSeconds: 1
diff --git a/Text2Image/kubernetes/helm/README.md b/Text2Image/kubernetes/helm/README.md
new file mode 100644
index 0000000000..6d26e77bd9
--- /dev/null
+++ b/Text2Image/kubernetes/helm/README.md
@@ -0,0 +1,18 @@
+# Deploy txt2img on a Kubernetes cluster
+
+- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
+- For more deployment options, refer to the [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
+
+## Deploy on Xeon
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
+```
+
+## Deploy on Gaudi
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
+```
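+
+## Verify the deployment (optional)
+
+A minimal sanity check, assuming the release name `txt2img` from the commands above. The service port (9379), the `/v1/text2image` route, and the request payload follow the text2image microservice defaults and are assumptions here; adjust them if your chart version differs.
+
+```bash
+# Check that all pods in the release are Running/Ready
+kubectl get pods
+
+# Forward the service port locally (port 9379 is an assumption)
+kubectl port-forward svc/txt2img 9379:9379 &
+
+# Request one image for a test prompt (assumed endpoint and payload)
+curl http://localhost:9379/v1/text2image \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "An astronaut riding a green horse", "num_images_per_prompt": 1}'
+```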
diff --git a/Text2Image/kubernetes/helm/cpu-values.yaml b/Text2Image/kubernetes/helm/cpu-values.yaml
new file mode 100644
index 0000000000..87a6085784
--- /dev/null
+++ b/Text2Image/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+text2image:
+ image:
+ repository: opea/text2image
diff --git a/Text2Image/kubernetes/helm/gaudi-values.yaml b/Text2Image/kubernetes/helm/gaudi-values.yaml
new file mode 100644
index 0000000000..f43d405d5a
--- /dev/null
+++ b/Text2Image/kubernetes/helm/gaudi-values.yaml
@@ -0,0 +1,30 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+text2image:
+ accelDevice: "gaudi"
+ image:
+ repository: opea/text2image-gaudi
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ # The following hugepage-related settings are for the default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5
+ # Users should change the resource limits for other models
+ hugepages-2Mi: 256Mi
+ volumes:
+ - name: hugepage-2mi
+ emptyDir:
+ medium: HugePages-2Mi
+ volumeMounts:
+ - name: hugepage-2mi
+ mountPath: /hugepages-2Mi
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ failureThreshold: 120
diff --git a/Translation/tests/test_manifest_on_gaudi.sh b/Translation/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index ea1f113cd7..0000000000
--- a/Translation/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_translation() {
- # executed under path manifest/translation/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_translation {
- echo "namespace is $NAMESPACE"
- kubectl apply -f translation.yaml -n $NAMESPACE
- sleep 50s
-}
-
-function validate_translation() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/translation..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/translation \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice translation failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_Translation)
- pushd Translation/kubernetes/intel/hpu/gaudi/manifest
- init_translation
- popd
- ;;
- install_Translation)
- pushd Translation/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_translation
- popd
- ;;
- validate_Translation)
- NAMESPACE=$2
- SERVICE_NAME=translation
- validate_translation
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/Translation/tests/test_manifest_on_xeon.sh b/Translation/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index d32eb4a229..0000000000
--- a/Translation/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_translation() {
- # executed under path manifest/translation/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_translation {
- echo "namespace is $NAMESPACE"
- kubectl apply -f translation.yaml -n $NAMESPACE
-}
-
-function validate_translation() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/translation..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/translation \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice translation failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_Translation)
- pushd Translation/kubernetes/intel/cpu/xeon/manifest
- init_translation
- popd
- ;;
- install_Translation)
- pushd Translation/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_translation
- popd
- ;;
- validate_Translation)
- NAMESPACE=$2
- SERVICE_NAME=translation
- validate_translation
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/VisualQnA/kubernetes/helm/gaudi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-values.yaml
index 5a0e95c3a9..eb6494a142 100644
--- a/VisualQnA/kubernetes/helm/gaudi-values.yaml
+++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml
@@ -9,13 +9,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5