diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml
index 39ac1388db..8de7eba9cb 100644
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -50,7 +50,6 @@ jobs:
# Image Build
####################################################################################################
build-images:
- if: ${{ !(fromJSON(inputs.test_helmchart)) }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
diff --git a/.github/workflows/_helm-e2e.yml b/.github/workflows/_helm-e2e.yml
index b162baf9a7..48be97012a 100644
--- a/.github/workflows/_helm-e2e.yml
+++ b/.github/workflows/_helm-e2e.yml
@@ -29,6 +29,10 @@ on:
default: "latest"
required: false
type: string
+ version:
+ default: "0-latest"
+ required: false
+ type: string
jobs:
get-test-case:
@@ -154,6 +158,13 @@ jobs:
exit 0
fi
+ for img in `helm template -n $NAMESPACE $RELEASE_NAME oci://ghcr.io/opea-project/charts/${CHART_NAME} -f ${{ inputs.example }}/kubernetes/helm/${value_file} --version ${{ inputs.version }} | grep 'image:' | grep 'opea/' | awk '{print $2}' | xargs`;
+ do
+          # increase helm install wait for the vllm-gaudi case
+ if [[ $img == *"vllm-gaudi"* ]]; then
+ ROLLOUT_TIMEOUT_SECONDS=900s
+ fi
+ done
if ! helm install \
--create-namespace \
--namespace $NAMESPACE \
@@ -163,9 +174,11 @@ jobs:
--set global.modelUseHostPath=/home/sdp/.cache/huggingface/hub \
--set GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
--set GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
+ --set web-retriever.GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
+ --set web-retriever.GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
-f ${{ inputs.example }}/kubernetes/helm/${value_file} \
- --version 0-latest \
- --wait; then
+ --version ${{ inputs.version }} \
+ --wait --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then
echo "Failed to install chart ${{ inputs.example }}"
echo "skip_validate=true" >> $GITHUB_ENV
.github/workflows/scripts/k8s-utils.sh dump_pods_status $NAMESPACE
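Note: the new image-scan loop above only raises the helm `--wait` timeout when a vllm-gaudi image is rendered by the chart. A minimal standalone sketch of that pattern follows; the image list and the 600s default are illustrative (the workflow's real default for `ROLLOUT_TIMEOUT_SECONDS` is defined elsewhere), and in the workflow the list comes from `helm template ... | grep 'image:' | grep 'opea/'`.

```bash
#!/usr/bin/env bash
# Sketch of the timeout-bump logic; values are illustrative, not taken from the workflow.
ROLLOUT_TIMEOUT_SECONDS="600s"   # assumed default defined earlier in the workflow

# Stand-in for the images extracted from `helm template` output.
images="opea/chatqna:latest opea/vllm-gaudi:latest"

for img in $images; do
  # vLLM on Gaudi needs a longer warmup, so widen the helm --wait window.
  if [[ $img == *"vllm-gaudi"* ]]; then
    ROLLOUT_TIMEOUT_SECONDS=900s
  fi
done

echo "helm install ... --wait --timeout $ROLLOUT_TIMEOUT_SECONDS"
```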
diff --git a/.github/workflows/scripts/k8s-utils.sh b/.github/workflows/scripts/k8s-utils.sh
index ba58e1a152..0676a80d38 100755
--- a/.github/workflows/scripts/k8s-utils.sh
+++ b/.github/workflows/scripts/k8s-utils.sh
@@ -12,7 +12,7 @@ function dump_pod_log() {
kubectl describe pod $pod_name -n $namespace
echo "-----------------------------------"
echo "#kubectl logs $pod_name -n $namespace"
- kubectl logs $pod_name -n $namespace
+ kubectl logs $pod_name -n $namespace --all-containers --prefix=true
echo "-----------------------------------"
}
@@ -44,8 +44,13 @@ function dump_pods_status() {
function dump_all_pod_logs() {
namespace=$1
+ echo "------SUMMARY of POD STATUS in NS $namespace------"
+ kubectl get pods -n $namespace -o wide
+ echo "------SUMMARY of SVC STATUS in NS $namespace------"
+ kubectl get services -n $namespace -o wide
+  echo "------SUMMARY of ENDPOINT STATUS in NS $namespace------"
+ kubectl get endpoints -n $namespace -o wide
echo "-----DUMP POD STATUS AND LOG in NS $namespace------"
-
pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}')
for pod_name in $pods
do
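As a usage note, the dump helpers are invoked the same way the workflow already calls `dump_pods_status`. A hedged example of collecting the new status summaries plus per-pod logs after a failed install, assuming the script dispatches the function named in its first argument (as the existing call suggests); the namespace value is illustrative.

```bash
# Illustrative post-failure debugging calls from the repo root.
NAMESPACE=example-ns
.github/workflows/scripts/k8s-utils.sh dump_pods_status "$NAMESPACE"
.github/workflows/scripts/k8s-utils.sh dump_all_pod_logs "$NAMESPACE"
```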
diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/README.md b/AgentQnA/docker_compose/amd/gpu/rocm/README.md
new file mode 100644
index 0000000000..b0d76d0d31
--- /dev/null
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/README.md
@@ -0,0 +1,101 @@
+# Single node on-prem deployment with Docker Compose on AMD GPU
+
+This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on an AMD GPU with ROCm. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).
+
+## Deployment with Docker
+
+1. First, clone this repo.
+ ```
+ export WORKDIR=
+ cd $WORKDIR
+ git clone https://github.com/opea-project/GenAIExamples.git
+ ```
+2. Set up the environment for this example
+
+ ```
+ # Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
+ export host_ip=$(hostname -I | awk '{print $1}')
+ # if you are in a proxy environment, also set the proxy-related environment variables
+ export http_proxy="Your_HTTP_Proxy"
+ export https_proxy="Your_HTTPs_Proxy"
+ # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+ export no_proxy="Your_No_Proxy"
+
+ export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
+   # Set OPENAI_API_KEY if you want to use OpenAI models
+ export OPENAI_API_KEY=
+ # Set AMD GPU settings
+ export AGENTQNA_CARD_ID="card1"
+ export AGENTQNA_RENDER_ID="renderD136"
+ ```
+
+3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
+
+ First, launch the mega-service.
+
+ ```
+ cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
+ bash launch_retrieval_tool.sh
+ ```
+
+ Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
+
+ ```
+ bash run_ingest_data.sh
+ ```
+
+4. Launch Tool service
+ In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
+ ```
+ docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
+ ```
+5. Launch `Agent` service
+
+ ```
+ cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/amd/gpu/rocm
+ bash launch_agent_service_tgi_rocm.sh
+ ```
+
+6. [Optional] Build the `Agent` Docker image if pulling the image failed.
+
+ ```
+ git clone https://github.com/opea-project/GenAIComps.git
+ cd GenAIComps
+ docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
+ ```
+
+## Validate services
+
+First, check the logs of the agent Docker containers:
+
+```
+# worker agent
+docker logs rag-agent-endpoint
+```
+
+```
+# supervisor agent
+docker logs react-agent-endpoint
+```
+
+You should see something like "HTTP server setup successful" if the Docker containers started successfully.
+
+Second, validate the worker agent:
+
+```
+curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
+ "query": "Most recent album by Taylor Swift"
+ }'
+```
+
+Third, validate the supervisor agent:
+
+```
+curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
+ "query": "Most recent album by Taylor Swift"
+ }'
+```
+
+## How to register your own tools with the agent
+
+You can take a look at the tool YAML and Python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
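For the "Validate services" step in the new README, a small hedged helper is sketched below. It only greps the two agent containers named above for the readiness string the README mentions; nothing here is specific to the agent implementation.

```bash
#!/usr/bin/env bash
# Wait until both agent containers report a successful HTTP server setup.
for name in rag-agent-endpoint react-agent-endpoint; do
  until docker logs "$name" 2>&1 | grep -q "HTTP server setup successful"; do
    echo "Waiting for $name to become ready..."
    sleep 5
  done
  echo "$name is ready."
done
```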
diff --git a/AgentQnA/kubernetes/helm/gaudi-values.yaml b/AgentQnA/kubernetes/helm/gaudi-values.yaml
index 91ef5d1026..2d171ea22a 100644
--- a/AgentQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
-tgi:
+vllm:
enabled: true
- accelDevice: "gaudi"
image:
- repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
- resources:
- limits:
- habana.ai/gaudi: 4
- MAX_INPUT_LENGTH: "4096"
- MAX_TOTAL_TOKENS: "8192"
- CUDA_GRAPHS: ""
- OMPI_MCA_btl_vader_single_copy_mechanism: "none"
- PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
- ENABLE_HPU_GRAPH: "true"
- LIMIT_HPU_GRAPH: "true"
- USE_FLASH_ATTENTION: "true"
- FLASH_ATTENTION_RECOMPUTE: "true"
- extraCmdArgs: ["--sharded","true","--num-shard","4"]
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- failureThreshold: 120
+ repository: opea/vllm-gaudi
+supervisor:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
+ragagent:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
+sqlagent:
+ llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/AgentQnA/tests/step1_build_images.sh b/AgentQnA/tests/step1_build_images.sh
index e00cf75106..4cb8a2e4d1 100644
--- a/AgentQnA/tests/step1_build_images.sh
+++ b/AgentQnA/tests/step1_build_images.sh
@@ -38,19 +38,17 @@ function build_vllm_docker_image() {
echo "Building the vllm docker image"
cd $WORKPATH
echo $WORKPATH
- if [ ! -d "./vllm" ]; then
- echo "clone vllm repo...."
- git clone https://github.com/vllm-project/vllm.git
+ if [ ! -d "./vllm-fork" ]; then
+ git clone https://github.com/HabanaAI/vllm-fork.git
fi
- cd ./vllm
- echo "Checking out latest stable release of vllm"
- git checkout v0.6.6
- docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+ cd ./vllm-fork
+ git checkout v0.6.4.post2+Gaudi-1.19.0
+ docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
- echo "opea/vllm-gaudi:comps failed"
+ echo "opea/vllm-gaudi:ci failed"
exit 1
else
- echo "opea/vllm-gaudi:comps successful"
+ echo "opea/vllm-gaudi:ci successful"
fi
}
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
index c99e212ff6..824f7aa855 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
@@ -8,15 +8,17 @@ WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
-export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
+export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
-export HF_CACHE_DIR=$WORKDIR/hf_cache
+export HF_CACHE_DIR=/data2/huggingface
if [ ! -d "$HF_CACHE_DIR" ]; then
+ HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
fi
+echo "HF_CACHE_DIR=$HF_CACHE_DIR"
ls $HF_CACHE_DIR
vllm_port=8086
@@ -35,7 +37,7 @@ function start_vllm_service_70B() {
echo "start vllm gaudi service"
echo "**************model is $model**************"
- vllm_image=opea/vllm-gaudi:comps
+ vllm_image=opea/vllm-gaudi:ci
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
sleep 5s
echo "Waiting vllm gaudi ready"
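After the `docker run` above, the test waits for the vLLM server to become ready. A minimal hedged sketch of such a wait loop against vLLM's OpenAI-compatible API is shown below; the port matches `vllm_port` in the script, and `/v1/models` is the standard vLLM model-listing endpoint.

```bash
# Poll the vLLM OpenAI-compatible endpoint until the model list is served.
vllm_port=8086
until curl -sf "http://localhost:${vllm_port}/v1/models" > /dev/null; do
  echo "Waiting for vllm-gaudi-server to become ready..."
  sleep 30
done
echo "vLLM is serving."
```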
diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh
index cf224b6aa1..880102f0f8 100644
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -4,9 +4,6 @@
set -xe
-echo "All running containers"
-docker ps
-
WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
diff --git a/AudioQnA/kubernetes/helm/gaudi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-values.yaml
index faaad653ef..9b06ff4296 100644
--- a/AudioQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -5,7 +5,7 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
failureThreshold: 120
whisper:
+ image:
+ repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
+ image:
+ repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index 01a00a8193..764afba4d4 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -280,7 +280,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
1. TEI Embedding Service
```bash
- curl ${host_ip}:6006/embed \
+ curl http://${host_ip}:6006/embed \
-X POST \
-d '{"inputs":"What is Deep Learning?"}' \
-H 'Content-Type: application/json'
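For the TEI embedding check above, a successful call returns a JSON array containing one embedding vector per input. A hedged follow-up that prints just the embedding dimension, assuming `jq` is installed on the host:

```bash
# Call the TEI embedding service and print the embedding dimension.
curl -s http://${host_ip}:6006/embed \
  -X POST \
  -d '{"inputs":"What is Deep Learning?"}' \
  -H 'Content-Type: application/json' | jq '.[0] | length'
```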
diff --git a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 0000000000..f552e1d5bc
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,112 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Override CPU resource request and probe timing values in specific subcharts
+#
+# RESOURCES
+#
+# Resource request matching actual resource usage (with enough slack)
+# is important when service is scaled up, so that right amount of pods
+# get scheduled to right nodes.
+#
+# Because resource usage depends on the used devices, model, data type
+# and SW versions, and this top-level chart has overrides for them,
+# resource requests need to be specified here too.
+#
+# To test service without resource request, use "resources: {}".
+#
+# PROBES
+#
+# Inferencing pods startup / warmup takes *much* longer on CPUs than
+# with acceleration devices, and their responses are also slower,
+# especially when node is running several instances of these services.
+#
+# Kubernetes restarting pod before its startup finishes, or not
+# sending it queries because it's not in ready state due to slow
+# readiness responses, does really NOT help in getting faster responses.
+#
+# => probe timings need to be increased when running on CPU.
+
+vllm:
+ enabled: false
+tgi:
+ enabled: true
+ # TODO: add Helm value also for TGI data type option:
+ # https://github.com/opea-project/GenAIExamples/issues/330
+ LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+
+ # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
+ #resources:
+ # limits:
+ # cpu: 8
+ # memory: 70Gi
+ # requests:
+ # cpu: 6
+ # memory: 65Gi
+
+ livenessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ failureThreshold: 24
+ timeoutSeconds: 4
+ readinessProbe:
+ initialDelaySeconds: 16
+ periodSeconds: 8
+ timeoutSeconds: 4
+ startupProbe:
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ failureThreshold: 180
+ timeoutSeconds: 2
+
+teirerank:
+ RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+
+ # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
+ resources:
+ limits:
+ cpu: 4
+ memory: 30Gi
+ requests:
+ cpu: 2
+ memory: 25Gi
+
+ livenessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ failureThreshold: 24
+ timeoutSeconds: 4
+ readinessProbe:
+ initialDelaySeconds: 8
+ periodSeconds: 8
+ timeoutSeconds: 4
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 120
+
+tei:
+ EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+
+ # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
+ resources:
+ limits:
+ cpu: 4
+ memory: 4Gi
+ requests:
+ cpu: 2
+ memory: 3Gi
+
+ livenessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 24
+ timeoutSeconds: 2
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 2
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 120
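A hypothetical use of the new TGI-on-CPU override file follows. As above, the chart reference follows the `oci://ghcr.io/opea-project/charts/<chart>` pattern from the e2e workflow, and the release name, namespace, and token setting are assumptions.

```bash
# Illustrative install of ChatQnA with TGI on CPU instead of vLLM.
helm install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --create-namespace --namespace chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN="${HUGGINGFACEHUB_API_TOKEN}" \
  -f ChatQnA/kubernetes/helm/cpu-tgi-values.yaml \
  --version 0-latest
```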
diff --git a/ChatQnA/kubernetes/helm/cpu-values.yaml b/ChatQnA/kubernetes/helm/cpu-values.yaml
index b4c5ee5ddb..86b68a921f 100644
--- a/ChatQnA/kubernetes/helm/cpu-values.yaml
+++ b/ChatQnA/kubernetes/helm/cpu-values.yaml
@@ -1,109 +1,5 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-# Override CPU resource request and probe timing values in specific subcharts
-#
-# RESOURCES
-#
-# Resource request matching actual resource usage (with enough slack)
-# is important when service is scaled up, so that right amount of pods
-# get scheduled to right nodes.
-#
-# Because resource usage depends on the used devices, model, data type
-# and SW versions, and this top-level chart has overrides for them,
-# resource requests need to be specified here too.
-#
-# To test service without resource request, use "resources: {}".
-#
-# PROBES
-#
-# Inferencing pods startup / warmup takes *much* longer on CPUs than
-# with acceleration devices, and their responses are also slower,
-# especially when node is running several instances of these services.
-#
-# Kubernetes restarting pod before its startup finishes, or not
-# sending it queries because it's not in ready state due to slow
-# readiness responses, does really NOT help in getting faster responses.
-#
-# => probe timings need to be increased when running on CPU.
-
-tgi:
- # TODO: add Helm value also for TGI data type option:
- # https://github.com/opea-project/GenAIExamples/issues/330
- LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-
- # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
- resources:
- limits:
- cpu: 8
- memory: 70Gi
- requests:
- cpu: 6
- memory: 65Gi
-
- livenessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- failureThreshold: 24
- timeoutSeconds: 4
- readinessProbe:
- initialDelaySeconds: 16
- periodSeconds: 8
- timeoutSeconds: 4
- startupProbe:
- initialDelaySeconds: 10
- periodSeconds: 5
- failureThreshold: 180
- timeoutSeconds: 2
-
-teirerank:
- RERANK_MODEL_ID: "BAAI/bge-reranker-base"
-
- # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
- resources:
- limits:
- cpu: 4
- memory: 30Gi
- requests:
- cpu: 2
- memory: 25Gi
-
- livenessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- failureThreshold: 24
- timeoutSeconds: 4
- readinessProbe:
- initialDelaySeconds: 8
- periodSeconds: 8
- timeoutSeconds: 4
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 120
-
-tei:
- EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
-
- # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
- resources:
- limits:
- cpu: 4
- memory: 4Gi
- requests:
- cpu: 2
- memory: 3Gi
-
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 24
- timeoutSeconds: 2
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 2
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- failureThreshold: 120
+image:
+ repository: opea/chatqna
diff --git a/ChatQnA/kubernetes/helm/gaudi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
similarity index 97%
rename from ChatQnA/kubernetes/helm/gaudi-values.yaml
rename to ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
index 47df99fc44..d4da00c976 100644
--- a/ChatQnA/kubernetes/helm/gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
+vllm:
+ enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
+ enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
index 6c1a44ebff..76eafae029 100644
--- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
@@ -6,9 +6,9 @@
tgi:
enabled: false
-
vllm:
enabled: true
+ shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- failureThreshold: 120
+ failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]
-
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
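The new `shmSize: 1Gi` setting typically surfaces as a memory-backed emptyDir mounted at `/dev/shm` in the vLLM pod (an assumption about how the chart wires it). A quick hedged check, with an illustrative namespace and label selector:

```bash
# Verify the shared-memory size inside the running vLLM pod.
POD=$(kubectl get pods -n chatqna -l app.kubernetes.io/name=vllm \
        -o jsonpath='{.items[0].metadata.name}')
kubectl exec -n chatqna "$POD" -- df -h /dev/shm
```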
diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
index aad83623d5..8e8a491a0a 100644
--- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
@@ -44,17 +44,18 @@ teirerank:
readinessProbe:
timeoutSeconds: 1
-tgi:
+tgi-guardrails:
+ enabled: true
accelDevice: "gaudi"
+ LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- # higher limits are needed with extra input tokens added by rerank
- MAX_INPUT_LENGTH: "2048"
- MAX_TOTAL_TOKENS: "4096"
+ MAX_INPUT_LENGTH: "1024"
+ MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
@@ -75,34 +76,37 @@ tgi:
timeoutSeconds: 1
failureThreshold: 120
-tgi-guardrails:
+tgi:
+ enabled: false
+vllm:
enabled: true
+ shmSize: 1Gi
accelDevice: "gaudi"
- LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
- repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "1024"
- MAX_TOTAL_TOKENS: "2048"
- CUDA_GRAPHS: ""
- OMPI_MCA_btl_vader_single_copy_mechanism: "none"
- ENABLE_HPU_GRAPH: "true"
- LIMIT_HPU_GRAPH: "true"
- USE_FLASH_ATTENTION: "true"
- FLASH_ATTENTION_RECOMPUTE: "true"
- livenessProbe:
+ startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
+ failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- startupProbe:
+ livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
- failureThreshold: 120
+
+ PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+ extraCmdArgs: [
+ "--tensor-parallel-size", "1",
+ "--block-size", "128",
+ "--max-num-seqs", "256",
+ "--max-seq_len-to-capture", "2048"
+ ]
diff --git a/ChatQnA/kubernetes/helm/guardrails-values.yaml b/ChatQnA/kubernetes/helm/guardrails-values.yaml
deleted file mode 100644
index d37a41060c..0000000000
--- a/ChatQnA/kubernetes/helm/guardrails-values.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-image:
- repository: opea/chatqna-guardrails
-
-# guardrails related config
-guardrails-usvc:
- enabled: true
- # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
- SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
-tgi-guardrails:
- enabled: true
- LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
diff --git a/ChatQnA/kubernetes/helm/nv-values.yaml b/ChatQnA/kubernetes/helm/nv-values.yaml
deleted file mode 100644
index 67c4e3ac18..0000000000
--- a/ChatQnA/kubernetes/helm/nv-values.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# To override values in subchart tgi
-tgi:
- accelDevice: "nvidia"
- image:
- repository: ghcr.io/huggingface/text-generation-inference
- tag: "2.2.0"
- resources:
- limits:
- nvidia.com/gpu: 1
- livenessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- readinessProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- startupProbe:
- initialDelaySeconds: 5
- periodSeconds: 5
- timeoutSeconds: 1
- failureThreshold: 120
diff --git a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh b/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh
deleted file mode 100755
index 274cc5209c..0000000000
--- a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [[ $status == false ]]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_guardrails"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- kubectl create namespace $ns
- # install guardrails
- kubectl apply -f chatqna-guardrails.yaml -n $ns
- # Sleep enough time for chatqna_guardrails to be ready
- sleep 60
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-guardrails
- validate_chatqna $NAMESPACE chatqna-guardrails
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh b/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh
deleted file mode 100755
index 63d494c9f8..0000000000
--- a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=10
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_guardrails"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
- kubectl create namespace $ns
- # install guardrail
- kubectl apply -f chatqna-guardrails.yaml -n $ns
- # Sleep enough time for chatqna_guardrails to be ready
- sleep 60
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-guardrails
- validate_chatqna $NAMESPACE chatqna-guardrails
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index d1764401ff..0000000000
--- a/ChatQnA/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
- echo "namespace is $NAMESPACE"
- kubectl apply -f chatqna.yaml -n $NAMESPACE
- # Sleep enough time for retreiver-usvc to be ready
- sleep 60
-}
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_chatqna
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna
- validate_chatqna $NAMESPACE chatqna
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_on_xeon.sh b/ChatQnA/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 4c93a8958e..0000000000
--- a/ChatQnA/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
- echo "namespace is $NAMESPACE"
- kubectl apply -f chatqna.yaml -n $NAMESPACE
- # Sleep enough time for retreiver-usvc to be ready
- sleep 60
-}
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=10
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
- # make sure microservice tgi-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
- curl http://$endpoint_url/generate -X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice tgi failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_chatqna
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna
- validate_chatqna $NAMESPACE chatqna
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh b/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh
deleted file mode 100755
index c1ab58460e..0000000000
--- a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
- local ns=$1
- local log=$2
- max_retry=20
- # make sure microservice retriever-usvc is ready
- # try to curl retriever-svc for max_retry times
- test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
- curl http://$endpoint_url/v1/retrieval -X POST \
- -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice retriever failed, exit with error."
- return 1
- fi
-
- # make sure microservice vllm-svc is ready
- for ((i=1; i<=max_retry; i++))
- do
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
- curl http://$endpoint_url/v1/chat/completions -X POST \
- -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
- -H 'Content-Type: application/json' && break
- sleep 30
- done
- # if i is bigger than max_retry, then exit with error
- if [ $i -gt $max_retry ]; then
- echo "Microservice vllm failed, exit with error."
- return 1
- fi
-
- # check megaservice works
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$log.log
- endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
- curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice failed, please check the logs in $LOGFILE!"
- return ${exit_code}
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] &&
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- return 1
- else
- echo "Response check succeed!"
- fi
- return 0
-}
-
-function install_chatqna() {
- echo "Testing manifests chatqna_vllm"
- local ns=$1
- bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
- kubectl create namespace $ns
- # install guardrail
- pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
- kubectl apply -f chatqna-vllm.yaml -n $ns
- # Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
- sleep 280
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_ChatQnA)
- pushd ChatQnA/tests/common
- bash _test_manifest_utils.sh init_ChatQnA
- popd
- ;;
- install_ChatQnA)
- NAMESPACE=$2
- install_chatqna $NAMESPACE
- popd
- ;;
- validate_ChatQnA)
- NAMESPACE=$2
- SERVICE_NAME=chatqna-vllm
- validate_chatqna $NAMESPACE chatqna-vllm
- ret=$?
- if [ $ret -ne 0 ]; then
- exit $ret
- fi
- ;;
-
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeGen/kubernetes/helm/gaudi-values.yaml b/CodeGen/kubernetes/helm/gaudi-values.yaml
index e26bb4a5ed..25ac2c3959 100644
--- a/CodeGen/kubernetes/helm/gaudi-values.yaml
+++ b/CodeGen/kubernetes/helm/gaudi-values.yaml
@@ -6,13 +6,18 @@ tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/CodeGen/tests/test_manifest_on_gaudi.sh b/CodeGen/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index a54e2d76df..0000000000
--- a/CodeGen/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
- # executed under path manifest/codegen/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
- -d '{"messages": "def print_hello_world():"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeGen)
- pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
- init_codegen
- popd
- ;;
- install_CodeGen)
- pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_codegen
- popd
- ;;
- validate_CodeGen)
- NAMESPACE=$2
- SERVICE_NAME=codegen
- validate_codegen
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeGen/tests/test_manifest_on_xeon.sh b/CodeGen/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index b0975f14a6..0000000000
--- a/CodeGen/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
- # executed under path manifest/codegen/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
- -d '{"messages": "def print_hello_world():"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeGen)
- pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
- init_codegen
- popd
- ;;
- install_CodeGen)
- pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_codegen
- popd
- ;;
- validate_CodeGen)
- NAMESPACE=$2
- SERVICE_NAME=codegen
- validate_codegen
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeTrans/kubernetes/helm/gaudi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-values.yaml
index e5367383ae..89ed259285 100644
--- a/CodeTrans/kubernetes/helm/gaudi-values.yaml
+++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml
@@ -5,13 +5,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/CodeTrans/tests/test_manifest_on_gaudi.sh b/CodeTrans/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index 7be05ae33b..0000000000
--- a/CodeTrans/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
- # executed under path manifest/codetrans/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codetrans \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeTrans)
- pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
- init_codetrans
- popd
- ;;
- install_CodeTrans)
- pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_codetrans
- popd
- ;;
- validate_CodeTrans)
- NAMESPACE=$2
- SERVICE_NAME=codetrans
- validate_codetrans
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/CodeTrans/tests/test_manifest_on_xeon.sh b/CodeTrans/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 7f2b969240..0000000000
--- a/CodeTrans/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
- # executed under path manifest/codetrans/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
- echo "namespace is $NAMESPACE"
- kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/codetrans \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_CodeTrans)
- pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
- init_codetrans
- popd
- ;;
- install_CodeTrans)
- pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_codetrans
- popd
- ;;
- validate_CodeTrans)
- NAMESPACE=$2
- SERVICE_NAME=codetrans
- validate_codetrans
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/DocSum/kubernetes/helm/cpu-values.yaml b/DocSum/kubernetes/helm/cpu-values.yaml
index 97818ae448..6f2ab7768f 100644
--- a/DocSum/kubernetes/helm/cpu-values.yaml
+++ b/DocSum/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
- LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+ enabled: true
+vllm:
+ enabled: false
diff --git a/DocSum/kubernetes/helm/gaudi-values.yaml b/DocSum/kubernetes/helm/gaudi-values.yaml
index 5cfae25928..eda0abe8c4 100644
--- a/DocSum/kubernetes/helm/gaudi-values.yaml
+++ b/DocSum/kubernetes/helm/gaudi-values.yaml
@@ -1,16 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+vllm:
+ enabled: false
+
+llm-uservice:
+ DOCSUM_BACKEND: "TGI"
+
tgi:
+ enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "1024"
- MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
diff --git a/DocSum/tests/test_manifest_on_gaudi.sh b/DocSum/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index db731ac4a9..0000000000
--- a/DocSum/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_docsum() {
- # executed under path manifest/docsum/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_docsum {
- echo "namespace is $NAMESPACE"
- kubectl apply -f docsum.yaml -n $NAMESPACE
-}
-
-function validate_docsum() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/docsum..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/docsum \
- -H 'Content-Type: multipart/form-data' \
- -F 'type=text' \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_DocSum)
- pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
- init_docsum
- popd
- ;;
- install_DocSum)
- pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_docsum
- popd
- ;;
- validate_DocSum)
- NAMESPACE=$2
- SERVICE_NAME=docsum
- validate_docsum
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/DocSum/tests/test_manifest_on_xeon.sh b/DocSum/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index 0bf613975c..0000000000
--- a/DocSum/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_docsum() {
- # executed under path manifest/docsum/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_docsum {
- echo "namespace is $NAMESPACE"
- kubectl apply -f docsum.yaml -n $NAMESPACE
-}
-
-function validate_docsum() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/docsum..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/docsum \
- -H 'Content-Type: multipart/form-data' \
- -F 'type=text' \
- -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- exit 1
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_DocSum)
- pushd DocSum/kubernetes/intel/cpu/xeon/manifest
- init_docsum
- popd
- ;;
- install_DocSum)
- pushd DocSum/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_docsum
- popd
- ;;
- validate_DocSum)
- NAMESPACE=$2
- SERVICE_NAME=docsum
- validate_docsum
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/FaqGen/kubernetes/helm/gaudi-values.yaml b/FaqGen/kubernetes/helm/gaudi-values.yaml
index d14729c4a3..e45cde146f 100644
--- a/FaqGen/kubernetes/helm/gaudi-values.yaml
+++ b/FaqGen/kubernetes/helm/gaudi-values.yaml
@@ -5,13 +5,25 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
- MAX_INPUT_LENGTH: "4096"
- MAX_TOTAL_TOKENS: "8192"
+ MAX_INPUT_LENGTH: "1024"
+ MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: "0"
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
+ PREFILL_BATCH_BUCKET_SIZE: 1
+ BATCH_BUCKET_SIZE: 8
+ extraCmdArgs:
+ - "--max-batch-total-tokens"
+ - "65536"
+ - "--max-batch-prefill-tokens"
+ - "4096"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md
index 8e46be1c8a..bda42ee285 100644
--- a/MultimodalQnA/README.md
+++ b/MultimodalQnA/README.md
@@ -87,12 +87,12 @@ In the below, we provide a table that describes for each microservice component
Gaudi default compose.yaml
-| MicroService | Open Source Project | HW | Port | Endpoint |
-| ------------ | --------------------- | ----- | ---- | --------------------------------------------------------------------- |
-| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
-| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
-| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
-| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest_with_text |
+| MicroService | Open Source Project | HW | Port | Endpoint |
+| ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- |
+| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
+| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
+| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
+| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
index e49b264823..f49b9815f1 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
@@ -289,6 +289,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_DELETE_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
index c271a4b553..5cb482bc55 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh
@@ -26,8 +26,8 @@ export MM_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
export LVM_SERVICE_HOST_IP=${HOST_IP}
export MEGA_SERVICE_HOST_IP=${HOST_IP}
export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete"
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete"
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
index 714fc72661..7e4fa6894a 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
@@ -92,7 +92,7 @@ export REDIS_INSIGHTS_PORT=8001
export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -334,15 +334,6 @@ export audio_fn="AudioSample.wav"
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn}
```
-```bash
-export DATAPREP_MMR_PORT=6007
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
-```
-
Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
```bash
@@ -398,6 +389,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_DELETE_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 681ba25ee6..31f543c755 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -26,7 +26,7 @@ services:
- redis-vector-db
- lvm-llava
ports:
- - "6007:${DATAPREP_MMR_PORT}"
+ - "${DATAPREP_MMR_PORT}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
index 593e89452c..057f90990c 100755
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -28,7 +28,7 @@ export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
index 35ed4abbc8..2379fc3d4d 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -37,7 +37,7 @@ export WHISPER_PORT=7066
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
export MAX_IMAGES=1
export WHISPER_MODEL="base"
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -282,15 +282,6 @@ wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_ex
Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
-```bash
-export DATAPREP_MMR_PORT=6007
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
-```
-
```bash
curl --silent --write-out "HTTPSTATUS:%{http_code}" \
${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} \
@@ -324,6 +315,7 @@ Also, you are able to get the list of all files that you uploaded:
```bash
curl -X POST \
-H "Content-Type: application/json" \
+ -d '{"file_path": "all"}' \
${DATAPREP_GET_FILE_ENDPOINT}
```
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 6c646674b7..26b5610f5e 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -28,7 +28,7 @@ services:
- redis-vector-db
- lvm
ports:
- - "6007:${DATAPREP_MMR_PORT}"
+ - "${DATAPREP_MMR_PORT}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -42,16 +42,21 @@ services:
MULTIMODAL_DATAPREP: true
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MULTIMODALREDIS"
restart: unless-stopped
- embedding-multimodal-bridgetower:
- image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
- container_name: embedding-multimodal-bridgetower
+ embedding-multimodal-bridgetower-gaudi:
+ image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest}
+ container_name: embedding-multimodal-bridgetower-gaudi
ports:
- ${EMM_BRIDGETOWER_PORT}:${EMM_BRIDGETOWER_PORT}
+ ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
PORT: ${EMM_BRIDGETOWER_PORT}
+ HABANA_VISIBLE_DEVICES: all
+ runtime: habana
+ cap_add:
+ - SYS_NICE
healthcheck:
test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMM_BRIDGETOWER_PORT}/v1/health_check"]
interval: 10s
@@ -64,7 +69,7 @@ services:
image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
container_name: embedding
depends_on:
- embedding-multimodal-bridgetower:
+ embedding-multimodal-bridgetower-gaudi:
condition: service_healthy
ports:
- ${MM_EMBEDDING_PORT_MICROSERVICE}:${MM_EMBEDDING_PORT_MICROSERVICE}
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index a92483f9a0..cc35d58d08 100755
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -28,7 +28,7 @@ export WHISPER_PORT=7066
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
export MAX_IMAGES=1
-export DATAPREP_MMR_PORT=5000
+export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml
index e90d0cc686..1fc599c3e5 100644
--- a/MultimodalQnA/docker_image_build/build.yaml
+++ b/MultimodalQnA/docker_image_build/build.yaml
@@ -23,6 +23,12 @@ services:
dockerfile: comps/third_parties/bridgetower/src/Dockerfile
extends: multimodalqna
image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
+ embedding-multimodal-bridgetower-gaudi:
+ build:
+ context: GenAIComps
+ dockerfile: comps/third_parties/bridgetower/src/Dockerfile.intel_hpu
+ extends: multimodalqna
+ image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest}
embedding:
build:
context: GenAIComps
diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh
index d50e024dc2..ccb4f1894d 100644
--- a/MultimodalQnA/tests/test_compose_on_gaudi.sh
+++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh
@@ -59,7 +59,7 @@ function build_docker_images() {
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm dataprep whisper"
+ service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever lvm dataprep whisper"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
@@ -82,7 +82,7 @@ function setup_env() {
export MAX_IMAGES=1
export WHISPER_MODEL="base"
export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
- export DATAPREP_MMR_PORT=5000
+ export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -177,19 +177,19 @@ function validate_microservices() {
# Check if the microservices are running correctly.
# Bridgetower Embedding Server
- echo "Validating embedding-multimodal-bridgetower"
+ echo "Validating embedding-multimodal-bridgetower-gaudi"
validate_service \
"http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
- "embedding-multimodal-bridgetower" \
- "embedding-multimodal-bridgetower" \
+ "embedding-multimodal-bridgetower-gaudi" \
+ "embedding-multimodal-bridgetower-gaudi" \
'{"text":"This is example"}'
validate_service \
"http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
- "embedding-multimodal-bridgetower" \
- "embedding-multimodal-bridgetower" \
+ "embedding-multimodal-bridgetower-gaudi" \
+ "embedding-multimodal-bridgetower-gaudi" \
'{"text":"This is example", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
# embedding microservice
@@ -210,11 +210,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
-
# test data prep
echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh
index 7787d919ee..9ba5c68c90 100644
--- a/MultimodalQnA/tests/test_compose_on_rocm.sh
+++ b/MultimodalQnA/tests/test_compose_on_rocm.sh
@@ -67,11 +67,11 @@ function setup_env() {
export LVM_SERVICE_HOST_IP=${HOST_IP}
export MEGA_SERVICE_HOST_IP=${HOST_IP}
export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna"
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get"
- export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete"
+ export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
+ export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
+ export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
+ export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
+ export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete"
}
function start_services() {
@@ -174,11 +174,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get"
-
# test data prep
echo "Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh
index 10b1579478..b5d254b58c 100644
--- a/MultimodalQnA/tests/test_compose_on_xeon.sh
+++ b/MultimodalQnA/tests/test_compose_on_xeon.sh
@@ -79,7 +79,7 @@ function setup_env() {
export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
- export DATAPREP_MMR_PORT=5000
+ export DATAPREP_MMR_PORT=6007
export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions"
@@ -207,11 +207,6 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
-
# test data prep
echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
index 6b94e54be9..7919ce5910 100644
--- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
+++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
@@ -610,7 +610,7 @@ def select_upload_type(choice, request: gr.Request):
"BACKEND_SERVICE_ENDPOINT", f"http://localhost:{MEGA_SERVICE_PORT}/v1/multimodalqna"
)
dataprep_ingest_endpoint = os.getenv(
- "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest_with_text"
+ "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest"
)
dataprep_gen_transcript_endpoint = os.getenv(
"DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_transcripts"
diff --git a/SearchQnA/kubernetes/helm/README.md b/SearchQnA/kubernetes/helm/README.md
new file mode 100644
index 0000000000..ccdf71a32f
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/README.md
@@ -0,0 +1,18 @@
+# Deploy SearchQnA on a Kubernetes cluster
+
+- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
+- For more deployment options, refer to the [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
+
+## Deploy on Xeon
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
+```
+
+## Deploy on Gaudi
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
+```
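+
+## Verify the deployment (optional)
+
+A minimal sanity check once all pods are Ready. The release name `searchqna` matches the install commands above; the service port (3008) and the `/v1/searchqna` route are assumptions based on the SearchQnA megaservice defaults, so adjust them if your chart version exposes different values.
+
+```bash
+# Check that all pods in the release are Running/Ready
+kubectl get pods
+
+# Forward the megaservice port locally (port 3008 is an assumption)
+kubectl port-forward svc/searchqna 3008:3008 &
+
+# Send a test query to the assumed /v1/searchqna endpoint
+curl http://localhost:3008/v1/searchqna \
+  -H "Content-Type: application/json" \
+  -d '{"messages": "What is the latest OPEA release?"}'
+```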
diff --git a/SearchQnA/kubernetes/helm/cpu-values.yaml b/SearchQnA/kubernetes/helm/cpu-values.yaml
new file mode 100644
index 0000000000..4de7affb83
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,7 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+llm-uservice:
+ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml
new file mode 100644
index 0000000000..ef327645de
--- /dev/null
+++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml
@@ -0,0 +1,50 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+ accelDevice: "gaudi"
+ image:
+ repository: ghcr.io/huggingface/tgi-gaudi
+ tag: "2.3.1"
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ MAX_INPUT_LENGTH: "2048"
+ MAX_TOTAL_TOKENS: "4096"
+ CUDA_GRAPHS: ""
+ HF_HUB_DISABLE_PROGRESS_BARS: 1
+ HF_HUB_ENABLE_HF_TRANSFER: 0
+ ENABLE_HPU_GRAPH: true
+ LIMIT_HPU_GRAPH: true
+ USE_FLASH_ATTENTION: true
+ FLASH_ATTENTION_RECOMPUTE: true
+ livenessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ failureThreshold: 120
+
+tei:
+ accelDevice: "gaudi"
+ image:
+ repository: ghcr.io/huggingface/tei-gaudi
+ tag: "1.5.0"
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ MAX_WARMUP_SEQUENCE_LENGTH: 512
+ securityContext:
+ readOnlyRootFilesystem: false
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ livenessProbe:
+ timeoutSeconds: 1
+ readinessProbe:
+ timeoutSeconds: 1
diff --git a/Text2Image/kubernetes/helm/README.md b/Text2Image/kubernetes/helm/README.md
new file mode 100644
index 0000000000..6d26e77bd9
--- /dev/null
+++ b/Text2Image/kubernetes/helm/README.md
@@ -0,0 +1,18 @@
+# Deploy txt2img on a Kubernetes cluster
+
+- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
+- For more deployment options, refer to the [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
+
+## Deploy on Xeon
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
+```
+
+## Deploy on Gaudi
+
+```bash
+export HFTOKEN="insert-your-huggingface-token-here"
+helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
+```
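+
+## Verify the deployment (optional)
+
+A minimal sanity check, assuming the release name `txt2img` from the commands above. The service port (9379), the `/v1/text2image` route, and the request payload follow the text2image microservice defaults and are assumptions here; adjust them if your chart version differs.
+
+```bash
+# Check that all pods in the release are Running/Ready
+kubectl get pods
+
+# Forward the service port locally (port 9379 is an assumption)
+kubectl port-forward svc/txt2img 9379:9379 &
+
+# Request one image for a test prompt (assumed endpoint and payload)
+curl http://localhost:9379/v1/text2image \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "An astronaut riding a green horse", "num_images_per_prompt": 1}'
+```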
diff --git a/Text2Image/kubernetes/helm/cpu-values.yaml b/Text2Image/kubernetes/helm/cpu-values.yaml
new file mode 100644
index 0000000000..87a6085784
--- /dev/null
+++ b/Text2Image/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+text2image:
+ image:
+ repository: opea/text2image
diff --git a/Text2Image/kubernetes/helm/gaudi-values.yaml b/Text2Image/kubernetes/helm/gaudi-values.yaml
new file mode 100644
index 0000000000..f43d405d5a
--- /dev/null
+++ b/Text2Image/kubernetes/helm/gaudi-values.yaml
@@ -0,0 +1,30 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+text2image:
+ accelDevice: "gaudi"
+ image:
+ repository: opea/text2image-gaudi
+ resources:
+ limits:
+ habana.ai/gaudi: 1
+ # The following hugepage-related settings are for the default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5
+ # Users should change the resource limits for other models
+ hugepages-2Mi: 256Mi
+ volumes:
+ - name: hugepage-2mi
+ emptyDir:
+ medium: HugePages-2Mi
+ volumeMounts:
+ - name: hugepage-2mi
+ mountPath: /hugepages-2Mi
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ startupProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ timeoutSeconds: 1
+ failureThreshold: 120
diff --git a/Translation/tests/test_manifest_on_gaudi.sh b/Translation/tests/test_manifest_on_gaudi.sh
deleted file mode 100755
index ea1f113cd7..0000000000
--- a/Translation/tests/test_manifest_on_gaudi.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_translation() {
- # executed under path manifest/translation/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_translation {
- echo "namespace is $NAMESPACE"
- kubectl apply -f translation.yaml -n $NAMESPACE
- sleep 50s
-}
-
-function validate_translation() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/translation..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/translation \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice translation failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_Translation)
- pushd Translation/kubernetes/intel/hpu/gaudi/manifest
- init_translation
- popd
- ;;
- install_Translation)
- pushd Translation/kubernetes/intel/hpu/gaudi/manifest
- NAMESPACE=$2
- install_translation
- popd
- ;;
- validate_Translation)
- NAMESPACE=$2
- SERVICE_NAME=translation
- validate_translation
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/Translation/tests/test_manifest_on_xeon.sh b/Translation/tests/test_manifest_on_xeon.sh
deleted file mode 100755
index d32eb4a229..0000000000
--- a/Translation/tests/test_manifest_on_xeon.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_translation() {
- # executed under path manifest/translation/xeon
- # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
- find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
- # replace microservice image tag
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
- # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
- find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
- # set huggingface token
- find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_translation {
- echo "namespace is $NAMESPACE"
- kubectl apply -f translation.yaml -n $NAMESPACE
-}
-
-function validate_translation() {
- ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
- port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
- echo "try to curl http://${ip_address}:${port}/v1/translation..."
-
- # generate a random logfile name to avoid conflict among multiple runners
- LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
- # Curl the Mega Service
- curl http://${ip_address}:${port}/v1/translation \
- -H 'Content-Type: application/json' \
- -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
- exit_code=$?
- if [ $exit_code -ne 0 ]; then
- echo "Megaservice translation failed, please check the logs in $LOGFILE!"
- exit 1
- fi
-
- echo "Checking response results, make sure the output is reasonable. "
- local status=false
- if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
- status=true
- fi
-
- if [ $status == false ]; then
- echo "Response check failed, please check the logs in artifacts!"
- else
- echo "Response check succeed!"
- fi
-}
-
-if [ $# -eq 0 ]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-case "$1" in
- init_Translation)
- pushd Translation/kubernetes/intel/cpu/xeon/manifest
- init_translation
- popd
- ;;
- install_Translation)
- pushd Translation/kubernetes/intel/cpu/xeon/manifest
- NAMESPACE=$2
- install_translation
- popd
- ;;
- validate_Translation)
- NAMESPACE=$2
- SERVICE_NAME=translation
- validate_translation
- ;;
- *)
- echo "Unknown function: $1"
- ;;
-esac
diff --git a/VisualQnA/kubernetes/helm/gaudi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-values.yaml
index 5a0e95c3a9..eb6494a142 100644
--- a/VisualQnA/kubernetes/helm/gaudi-values.yaml
+++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml
@@ -9,13 +9,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
- tag: "2.0.6"
+ tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
+ OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+ ENABLE_HPU_GRAPH: "true"
+ LIMIT_HPU_GRAPH: "true"
+ USE_FLASH_ATTENTION: "true"
+ FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5