
Commit

Merge branch 'main' into melanie/mmqna_ui_port_fix
mhbuehler authored Jan 23, 2025
2 parents fa70be2 + 6600c32 commit cebd9be
Showing 45 changed files with 488 additions and 1,506 deletions.
1 change: 0 additions & 1 deletion .github/workflows/_example-workflow.yml
@@ -50,7 +50,6 @@ jobs:
# Image Build
####################################################################################################
build-images:
if: ${{ !(fromJSON(inputs.test_helmchart)) }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
17 changes: 15 additions & 2 deletions .github/workflows/_helm-e2e.yml
@@ -29,6 +29,10 @@ on:
default: "latest"
required: false
type: string
version:
default: "0-latest"
required: false
type: string

jobs:
get-test-case:
@@ -154,6 +158,13 @@ jobs:
exit 0
fi
for img in `helm template -n $NAMESPACE $RELEASE_NAME oci://ghcr.io/opea-project/charts/${CHART_NAME} -f ${{ inputs.example }}/kubernetes/helm/${value_file} --version ${{ inputs.version }} | grep 'image:' | grep 'opea/' | awk '{print $2}' | xargs`;
do
# Increase the helm install wait timeout for the vllm-gaudi case
if [[ $img == *"vllm-gaudi"* ]]; then
ROLLOUT_TIMEOUT_SECONDS=900s
fi
done
if ! helm install \
--create-namespace \
--namespace $NAMESPACE \
@@ -163,9 +174,11 @@
--set global.modelUseHostPath=/home/sdp/.cache/huggingface/hub \
--set GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
--set GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
--set web-retriever.GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \
--set web-retriever.GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \
-f ${{ inputs.example }}/kubernetes/helm/${value_file} \
--version 0-latest \
--wait; then
--version ${{ inputs.version }} \
--wait --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then
echo "Failed to install chart ${{ inputs.example }}"
echo "skip_validate=true" >> $GITHUB_ENV
.github/workflows/scripts/k8s-utils.sh dump_pods_status $NAMESPACE
9 changes: 7 additions & 2 deletions .github/workflows/scripts/k8s-utils.sh
@@ -12,7 +12,7 @@ function dump_pod_log() {
kubectl describe pod $pod_name -n $namespace
echo "-----------------------------------"
echo "#kubectl logs $pod_name -n $namespace"
kubectl logs $pod_name -n $namespace
kubectl logs $pod_name -n $namespace --all-containers --prefix=true
echo "-----------------------------------"
}

@@ -44,8 +44,13 @@ function dump_pods_status() {

function dump_all_pod_logs() {
namespace=$1
echo "------SUMMARY of POD STATUS in NS $namespace------"
kubectl get pods -n $namespace -o wide
echo "------SUMMARY of SVC STATUS in NS $namespace------"
kubectl get services -n $namespace -o wide
echo "------SUMMARY of endpoint STATUS in NS $namespace------"
kubectl get endpoints -n $namespace -o wide
echo "-----DUMP POD STATUS AND LOG in NS $namespace------"

pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}')
for pod_name in $pods
do
101 changes: 101 additions & 0 deletions AgentQnA/docker_compose/amd/gpu/rocm/README.md
@@ -0,0 +1,101 @@
# Single node on-prem deployment with Docker Compose on AMD GPU

This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on an AMD GPU. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).

## Deployment with Docker

1. First, clone this repo.
```
export WORKDIR=<your-work-directory>
cd $WORKDIR
git clone https://github.com/opea-project/GenAIExamples.git
```
2. Set up the environment for this example (a quick GPU device check is sketched after this list).

```
# Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
export host_ip=$(hostname -I | awk '{print $1}')
# if you are in a proxy environment, also set the proxy-related environment variables
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
# OPENAI_API_KEY is needed if you want to use OpenAI models
export OPENAI_API_KEY=<your-openai-key>
# Set AMD GPU settings
export AGENTQNA_CARD_ID="card1"
export AGENTQNA_RENDER_ID="renderD136"
```

3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)

First, launch the mega-service.

```
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
bash launch_retrieval_tool.sh
```

Then, ingest data into the vector database. An example script is provided below; you can also ingest your own data.

```
bash run_ingest_data.sh
```

4. Launch Tool service

In this example, we use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs (a reachability check for this service is sketched after this list).
```
docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
```
5. Launch `Agent` service

```
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/amd/gpu/rocm
bash launch_agent_service_tgi_rocm.sh
```

6. [Optional] Build the `Agent` Docker image if pulling the image fails.

```
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
```
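
Before moving on to validation, you can sanity-check the deployment. The snippet below is a minimal sketch: it assumes the ROCm driver exposes the GPU nodes under `/dev/dri`, that the `rocm-smi` tool is installed, and that the CRAG mock API container from step 4 is mapped to port 8080 as shown above.

```
# Confirm the card/render IDs from step 2 exist on this host
ls /dev/dri/
# Expected entries look like: card1  renderD136

# If the ROCm tools are installed, list the detected AMD GPUs
rocm-smi

# Check that the CRAG mock API container from step 4 is listening;
# any HTTP status code in the output (even 404) means the server is up
curl -s -o /dev/null -w "%{http_code}\n" http://${host_ip}:8080/
```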

## Validate services

First, look at the logs of the agent Docker containers:

```
# worker agent
docker logs rag-agent-endpoint
```

```
# supervisor agent
docker logs react-agent-endpoint
```

You should see something like "HTTP server setup successful" if the Docker containers started successfully.
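
To script this check, you can grep the logs for the startup message (a simple sketch; the message text is the one quoted above):

```
for agent in rag-agent-endpoint react-agent-endpoint; do
  if docker logs $agent 2>&1 | grep -q "HTTP server setup successful"; then
    echo "$agent started successfully"
  else
    echo "$agent has not reported a successful start; inspect its logs"
  fi
done
```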

Second, validate the worker agent:

```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"query": "Most recent album by Taylor Swift"
}'
```

Third, validate the supervisor agent:

```
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"query": "Most recent album by Taylor Swift"
}'
```

## How to register your own tools with the agent

You can take a look at the tools YAML and Python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
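
As a rough illustration, a custom tool definition might look like the sketch below. The tool name, the `tools.py:search_web` callable, and the file name are hypothetical; follow the linked instructions for the authoritative schema.

```
# Hypothetical sketch only -- the tool name, callable, and fields are illustrative
cat > $TOOLSET_PATH/my_tools.yaml <<'EOF'
search_web:
  description: Search the web for a given query and return relevant text.
  callable_api: tools.py:search_web
  args_schema:
    query:
      type: str
      description: query
  return_output: retrieved_data
EOF
```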
38 changes: 8 additions & 30 deletions AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
resources:
limits:
habana.ai/gaudi: 4
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
extraCmdArgs: ["--sharded","true","--num-shard","4"]
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
repository: opea/vllm-gaudi
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
16 changes: 7 additions & 9 deletions AgentQnA/tests/step1_build_images.sh
@@ -38,19 +38,17 @@ function build_vllm_docker_image() {
echo "Building the vllm docker image"
cd $WORKPATH
echo $WORKPATH
if [ ! -d "./vllm" ]; then
echo "clone vllm repo...."
git clone https://github.com/vllm-project/vllm.git
if [ ! -d "./vllm-fork" ]; then
git clone https://github.com/HabanaAI/vllm-fork.git
fi
cd ./vllm
echo "Checking out latest stable release of vllm"
git checkout v0.6.6
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd ./vllm-fork
git checkout v0.6.4.post2+Gaudi-1.19.0
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
echo "opea/vllm-gaudi:comps failed"
echo "opea/vllm-gaudi:ci failed"
exit 1
else
echo "opea/vllm-gaudi:comps successful"
echo "opea/vllm-gaudi:ci successful"
fi
}

8 changes: 5 additions & 3 deletions AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
@@ -8,15 +8,17 @@ WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Meta-Llama-3.1-70B-Instruct"

export HF_CACHE_DIR=$WORKDIR/hf_cache
export HF_CACHE_DIR=/data2/huggingface
if [ ! -d "$HF_CACHE_DIR" ]; then
HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
fi
echo "HF_CACHE_DIR=$HF_CACHE_DIR"
ls $HF_CACHE_DIR

vllm_port=8086
@@ -35,7 +37,7 @@ function start_vllm_service_70B() {

echo "start vllm gaudi service"
echo "**************model is $model**************"
vllm_image=opea/vllm-gaudi:comps
vllm_image=opea/vllm-gaudi:ci
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
sleep 5s
echo "Waiting vllm gaudi ready"
3 changes: 0 additions & 3 deletions AgentQnA/tests/test_compose_on_gaudi.sh
@@ -4,9 +4,6 @@

set -xe

echo "All running containers"
docker ps

WORKPATH=$(dirname "$PWD")
export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
6 changes: 5 additions & 1 deletion AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -5,7 +5,7 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
failureThreshold: 120

whisper:
image:
repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1

speecht5:
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
2 changes: 1 addition & 1 deletion ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -280,7 +280,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
1. TEI Embedding Service

```bash
curl ${host_ip}:6006/embed \
curl http://${host_ip}:6006/embed \
-X POST \
-d '{"inputs":"What is Deep Learning?"}' \
-H 'Content-Type: application/json'
