diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml index 39ac1388db..8de7eba9cb 100644 --- a/.github/workflows/_example-workflow.yml +++ b/.github/workflows/_example-workflow.yml @@ -50,7 +50,6 @@ jobs: # Image Build #################################################################################################### build-images: - if: ${{ !(fromJSON(inputs.test_helmchart)) }} runs-on: "docker-build-${{ inputs.node }}" steps: - name: Clean Up Working Directory diff --git a/.github/workflows/_helm-e2e.yml b/.github/workflows/_helm-e2e.yml index b162baf9a7..48be97012a 100644 --- a/.github/workflows/_helm-e2e.yml +++ b/.github/workflows/_helm-e2e.yml @@ -29,6 +29,10 @@ on: default: "latest" required: false type: string + version: + default: "0-latest" + required: false + type: string jobs: get-test-case: @@ -154,6 +158,13 @@ jobs: exit 0 fi + for img in `helm template -n $NAMESPACE $RELEASE_NAME oci://ghcr.io/opea-project/charts/${CHART_NAME} -f ${{ inputs.example }}/kubernetes/helm/${value_file} --version ${{ inputs.version }} | grep 'image:' | grep 'opea/' | awk '{print $2}' | xargs`; + do + # increase helm install wait for for vllm-gaudi case + if [[ $img == *"vllm-gaudi"* ]]; then + ROLLOUT_TIMEOUT_SECONDS=900s + fi + done if ! helm install \ --create-namespace \ --namespace $NAMESPACE \ @@ -163,9 +174,11 @@ jobs: --set global.modelUseHostPath=/home/sdp/.cache/huggingface/hub \ --set GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \ --set GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \ + --set web-retriever.GOOGLE_API_KEY=${{ env.GOOGLE_API_KEY}} \ + --set web-retriever.GOOGLE_CSE_ID=${{ env.GOOGLE_CSE_ID}} \ -f ${{ inputs.example }}/kubernetes/helm/${value_file} \ - --version 0-latest \ - --wait; then + --version ${{ inputs.version }} \ + --wait --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then echo "Failed to install chart ${{ inputs.example }}" echo "skip_validate=true" >> $GITHUB_ENV .github/workflows/scripts/k8s-utils.sh dump_pods_status $NAMESPACE diff --git a/.github/workflows/scripts/k8s-utils.sh b/.github/workflows/scripts/k8s-utils.sh index ba58e1a152..0676a80d38 100755 --- a/.github/workflows/scripts/k8s-utils.sh +++ b/.github/workflows/scripts/k8s-utils.sh @@ -12,7 +12,7 @@ function dump_pod_log() { kubectl describe pod $pod_name -n $namespace echo "-----------------------------------" echo "#kubectl logs $pod_name -n $namespace" - kubectl logs $pod_name -n $namespace + kubectl logs $pod_name -n $namespace --all-containers --prefix=true echo "-----------------------------------" } @@ -44,8 +44,13 @@ function dump_pods_status() { function dump_all_pod_logs() { namespace=$1 + echo "------SUMMARY of POD STATUS in NS $namespace------" + kubectl get pods -n $namespace -o wide + echo "------SUMMARY of SVC STATUS in NS $namespace------" + kubectl get services -n $namespace -o wide + echo "------SUMMARY of endpoint STATUS in NS $namespace------" + kubectl get endpoints -n $namespace -o wide echo "-----DUMP POD STATUS AND LOG in NS $namespace------" - pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}') for pod_name in $pods do diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/README.md b/AgentQnA/docker_compose/amd/gpu/rocm/README.md new file mode 100644 index 0000000000..b0d76d0d31 --- /dev/null +++ b/AgentQnA/docker_compose/amd/gpu/rocm/README.md @@ -0,0 +1,101 @@ +# Single node on-prem deployment with Docker Compose on AMD GPU + +This example showcases a hierarchical multi-agent system for question-answering applications. 
We deploy the example on an AMD GPU with ROCm. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md). + +## Deployment with docker + +1. First, clone this repo. + ``` + export WORKDIR= + cd $WORKDIR + git clone https://github.com/opea-project/GenAIExamples.git + ``` +2. Set up environment for this example
+ + ``` + # Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP" + export host_ip=$(hostname -I | awk '{print $1}') + # if you are in a proxy environment, also set the proxy-related environment variables + export http_proxy="Your_HTTP_Proxy" + export https_proxy="Your_HTTPs_Proxy" + # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" + export no_proxy="Your_No_Proxy" + + export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ + # OPENAI_API_KEY if you want to use OpenAI models + export OPENAI_API_KEY= + # Set AMD GPU settings + export AGENTQNA_CARD_ID="card1" + export AGENTQNA_RENDER_ID="renderD136" + ``` + +3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service) + + First, launch the mega-service. + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool + bash launch_retrieval_tool.sh + ``` + + Then, ingest data into the vector database. Here we provide an example. You can ingest your own data. + + ``` + bash run_ingest_data.sh + ``` + +4. Launch Tool service + In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs. + ``` + docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0 + ``` +5. Launch `Agent` service + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/amd/gpu/rocm + bash launch_agent_service_tgi_rocm.sh + ``` + +6. [Optional] Build `Agent` docker image if pulling images failed. + + ``` + git clone https://github.com/opea-project/GenAIComps.git + cd GenAIComps + docker build -t opea/agent:latest -f comps/agent/src/Dockerfile . + ``` + +## Validate services + +First, look at the logs of the agent docker containers: + +``` +# worker agent +docker logs rag-agent-endpoint +``` + +``` +# supervisor agent +docker logs react-agent-endpoint +``` + +You should see something like "HTTP server setup successful" if the docker containers are started successfully.
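If you prefer a scripted check over reading the logs by hand, a minimal sketch along the following lines can poll both agent containers for that startup message. The container names and the "HTTP server setup successful" string come from the steps above; the retry count and sleep interval are arbitrary illustrative values, not part of this example.

```
# Poll the worker and supervisor agent containers until the startup message appears.
# 30 retries x 5 seconds is an illustrative budget; adjust for your environment.
for name in rag-agent-endpoint react-agent-endpoint; do
  ready=false
  for i in $(seq 1 30); do
    if docker logs "$name" 2>&1 | grep -q "HTTP server setup successful"; then
      echo "$name is ready"
      ready=true
      break
    fi
    sleep 5
  done
  if [ "$ready" = false ]; then
    echo "$name did not report readiness; inspect it with: docker logs $name"
  fi
done
```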
+ +Second, validate worker agent: + +``` +curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + "query": "Most recent album by Taylor Swift" + }' +``` + +Third, validate supervisor agent: + +``` +curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + "query": "Most recent album by Taylor Swift" + }' +``` + +## How to register your own tools with agent + +You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md). diff --git a/AgentQnA/kubernetes/helm/gaudi-values.yaml b/AgentQnA/kubernetes/helm/gaudi-values.yaml index 91ef5d1026..2d171ea22a 100644 --- a/AgentQnA/kubernetes/helm/gaudi-values.yaml +++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml @@ -4,35 +4,13 @@ # Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values -tgi: +vllm: enabled: true - accelDevice: "gaudi" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" - resources: - limits: - habana.ai/gaudi: 4 - MAX_INPUT_LENGTH: "4096" - MAX_TOTAL_TOKENS: "8192" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - extraCmdArgs: ["--sharded","true","--num-shard","4"] - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 + repository: opea/vllm-gaudi +supervisor: + llm_endpoint_url: http://{{ .Release.Name }}-vllm +ragagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm +sqlagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm diff --git a/AgentQnA/tests/step1_build_images.sh b/AgentQnA/tests/step1_build_images.sh index e00cf75106..4cb8a2e4d1 100644 --- a/AgentQnA/tests/step1_build_images.sh +++ b/AgentQnA/tests/step1_build_images.sh @@ -38,19 +38,17 @@ function build_vllm_docker_image() { echo "Building the vllm docker image" cd $WORKPATH echo $WORKPATH - if [ ! -d "./vllm" ]; then - echo "clone vllm repo...." - git clone https://github.com/vllm-project/vllm.git + if [ ! -d "./vllm-fork" ]; then + git clone https://github.com/HabanaAI/vllm-fork.git fi - cd ./vllm - echo "Checking out latest stable release of vllm" - git checkout v0.6.6 - docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + cd ./vllm-fork + git checkout v0.6.4.post2+Gaudi-1.19.0 + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy if [ $? 
-ne 0 ]; then - echo "opea/vllm-gaudi:comps failed" + echo "opea/vllm-gaudi:ci failed" exit 1 else - echo "opea/vllm-gaudi:comps successful" + echo "opea/vllm-gaudi:ci successful" fi } diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh index c99e212ff6..824f7aa855 100644 --- a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh +++ b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh @@ -8,15 +8,17 @@ WORKPATH=$(dirname "$PWD") export WORKDIR=$WORKPATH/../../ echo "WORKDIR=${WORKDIR}" export ip_address=$(hostname -I | awk '{print $1}') -export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ +export TOOLSET_PATH=$WORKPATH/tools/ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} model="meta-llama/Meta-Llama-3.1-70B-Instruct" -export HF_CACHE_DIR=$WORKDIR/hf_cache +export HF_CACHE_DIR=/data2/huggingface if [ ! -d "$HF_CACHE_DIR" ]; then + HF_CACHE_DIR=$WORKDIR/hf_cache mkdir -p "$HF_CACHE_DIR" fi +echo "HF_CACHE_DIR=$HF_CACHE_DIR" ls $HF_CACHE_DIR vllm_port=8086 @@ -35,7 +37,7 @@ function start_vllm_service_70B() { echo "start vllm gaudi service" echo "**************model is $model**************" - vllm_image=opea/vllm-gaudi:comps + vllm_image=opea/vllm-gaudi:ci docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4 sleep 5s echo "Waiting vllm gaudi ready" diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh index cf224b6aa1..880102f0f8 100644 --- a/AgentQnA/tests/test_compose_on_gaudi.sh +++ b/AgentQnA/tests/test_compose_on_gaudi.sh @@ -4,9 +4,6 @@ set -xe -echo "All running containers" -docker ps - WORKPATH=$(dirname "$PWD") export WORKDIR=$WORKPATH/../../ echo "WORKDIR=${WORKDIR}" diff --git a/AudioQnA/kubernetes/helm/gaudi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-values.yaml index faaad653ef..9b06ff4296 100644 --- a/AudioQnA/kubernetes/helm/gaudi-values.yaml +++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml @@ -5,7 +5,7 @@ tgi: accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 @@ -33,11 +33,15 @@ tgi: failureThreshold: 120 whisper: + image: + repository: opea/whisper-gaudi resources: limits: habana.ai/gaudi: 1 speecht5: + image: + repository: opea/speecht5-gaudi resources: limits: habana.ai/gaudi: 1 diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index 01a00a8193..764afba4d4 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -280,7 +280,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v 1. 
TEI Embedding Service ```bash - curl ${host_ip}:6006/embed \ + curl http://${host_ip}:6006/embed \ -X POST \ -d '{"inputs":"What is Deep Learning?"}' \ -H 'Content-Type: application/json' diff --git a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml new file mode 100644 index 0000000000..f552e1d5bc --- /dev/null +++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml @@ -0,0 +1,112 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Override CPU resource request and probe timing values in specific subcharts +# +# RESOURCES +# +# Resource request matching actual resource usage (with enough slack) +# is important when service is scaled up, so that right amount of pods +# get scheduled to right nodes. +# +# Because resource usage depends on the used devices, model, data type +# and SW versions, and this top-level chart has overrides for them, +# resource requests need to be specified here too. +# +# To test service without resource request, use "resources: {}". +# +# PROBES +# +# Inferencing pods startup / warmup takes *much* longer on CPUs than +# with acceleration devices, and their responses are also slower, +# especially when node is running several instances of these services. +# +# Kubernetes restarting pod before its startup finishes, or not +# sending it queries because it's not in ready state due to slow +# readiness responses, does really NOT help in getting faster responses. +# +# => probe timings need to be increased when running on CPU. + +vllm: + enabled: false +tgi: + enabled: true + # TODO: add Helm value also for TGI data type option: + # https://github.com/opea-project/GenAIExamples/issues/330 + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + + # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit: + #resources: + # limits: + # cpu: 8 + # memory: 70Gi + # requests: + # cpu: 6 + # memory: 65Gi + + livenessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + failureThreshold: 24 + timeoutSeconds: 4 + readinessProbe: + initialDelaySeconds: 16 + periodSeconds: 8 + timeoutSeconds: 4 + startupProbe: + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 180 + timeoutSeconds: 2 + +teirerank: + RERANK_MODEL_ID: "BAAI/bge-reranker-base" + + # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: + resources: + limits: + cpu: 4 + memory: 30Gi + requests: + cpu: 2 + memory: 25Gi + + livenessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + failureThreshold: 24 + timeoutSeconds: 4 + readinessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + timeoutSeconds: 4 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + +tei: + EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" + + # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: + resources: + limits: + cpu: 4 + memory: 4Gi + requests: + cpu: 2 + memory: 3Gi + + livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 + timeoutSeconds: 2 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 diff --git a/ChatQnA/kubernetes/helm/cpu-values.yaml b/ChatQnA/kubernetes/helm/cpu-values.yaml index b4c5ee5ddb..86b68a921f 100644 --- a/ChatQnA/kubernetes/helm/cpu-values.yaml +++ b/ChatQnA/kubernetes/helm/cpu-values.yaml @@ -1,109 +1,5 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 
2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Override CPU resource request and probe timing values in specific subcharts -# -# RESOURCES -# -# Resource request matching actual resource usage (with enough slack) -# is important when service is scaled up, so that right amount of pods -# get scheduled to right nodes. -# -# Because resource usage depends on the used devices, model, data type -# and SW versions, and this top-level chart has overrides for them, -# resource requests need to be specified here too. -# -# To test service without resource request, use "resources: {}". -# -# PROBES -# -# Inferencing pods startup / warmup takes *much* longer on CPUs than -# with acceleration devices, and their responses are also slower, -# especially when node is running several instances of these services. -# -# Kubernetes restarting pod before its startup finishes, or not -# sending it queries because it's not in ready state due to slow -# readiness responses, does really NOT help in getting faster responses. -# -# => probe timings need to be increased when running on CPU. - -tgi: - # TODO: add Helm value also for TGI data type option: - # https://github.com/opea-project/GenAIExamples/issues/330 - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 - - # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit: - resources: - limits: - cpu: 8 - memory: 70Gi - requests: - cpu: 6 - memory: 65Gi - - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 - readinessProbe: - initialDelaySeconds: 16 - periodSeconds: 8 - timeoutSeconds: 4 - startupProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 180 - timeoutSeconds: 2 - -teirerank: - RERANK_MODEL_ID: "BAAI/bge-reranker-base" - - # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: - resources: - limits: - cpu: 4 - memory: 30Gi - requests: - cpu: 2 - memory: 25Gi - - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 - readinessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - timeoutSeconds: 4 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 120 - -tei: - EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" - - # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: - resources: - limits: - cpu: 4 - memory: 4Gi - requests: - cpu: 2 - memory: 3Gi - - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 24 - timeoutSeconds: 2 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 2 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 120 +image: + repository: opea/chatqna diff --git a/ChatQnA/kubernetes/helm/gaudi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml similarity index 97% rename from ChatQnA/kubernetes/helm/gaudi-values.yaml rename to ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml index 47df99fc44..d4da00c976 100644 --- a/ChatQnA/kubernetes/helm/gaudi-values.yaml +++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml @@ -4,12 +4,15 @@ # Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values +vllm: + enabled: false # TGI: largest bottleneck for ChatQnA tgi: + enabled: true accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 diff --git 
a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml index 6c1a44ebff..76eafae029 100644 --- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml +++ b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml @@ -6,9 +6,9 @@ tgi: enabled: false - vllm: enabled: true + shmSize: 1Gi accelDevice: "gaudi" image: repository: opea/vllm-gaudi @@ -19,7 +19,7 @@ vllm: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - failureThreshold: 120 + failureThreshold: 180 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 @@ -39,7 +39,6 @@ vllm: "--max-seq_len-to-capture", "2048" ] - # Reranking: second largest bottleneck when reranking is in use # (i.e. query context docs have been uploaded with data-prep) # diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml index aad83623d5..8e8a491a0a 100644 --- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml +++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml @@ -44,17 +44,18 @@ teirerank: readinessProbe: timeoutSeconds: 1 -tgi: +tgi-guardrails: + enabled: true accelDevice: "gaudi" + LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 - # higher limits are needed with extra input tokens added by rerank - MAX_INPUT_LENGTH: "2048" - MAX_TOTAL_TOKENS: "4096" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" OMPI_MCA_btl_vader_single_copy_mechanism: "none" ENABLE_HPU_GRAPH: "true" @@ -75,34 +76,37 @@ tgi: timeoutSeconds: 1 failureThreshold: 120 -tgi-guardrails: +tgi: + enabled: false +vllm: enabled: true + shmSize: 1Gi accelDevice: "gaudi" - LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + repository: opea/vllm-gaudi resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: + startupProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 + failureThreshold: 180 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - startupProbe: + livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - failureThreshold: 120 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] diff --git a/ChatQnA/kubernetes/helm/guardrails-values.yaml b/ChatQnA/kubernetes/helm/guardrails-values.yaml deleted file mode 100644 index d37a41060c..0000000000 --- a/ChatQnA/kubernetes/helm/guardrails-values.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -image: - repository: opea/chatqna-guardrails - -# guardrails related config -guardrails-usvc: - enabled: true - # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails" - SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" -tgi-guardrails: - enabled: true - LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" diff --git a/ChatQnA/kubernetes/helm/nv-values.yaml b/ChatQnA/kubernetes/helm/nv-values.yaml deleted file mode 100644 index 67c4e3ac18..0000000000 --- 
a/ChatQnA/kubernetes/helm/nv-values.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# To override values in subchart tgi -tgi: - accelDevice: "nvidia" - image: - repository: ghcr.io/huggingface/text-generation-inference - tag: "2.2.0" - resources: - limits: - nvidia.com/gpu: 1 - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 1 - failureThreshold: 120 diff --git a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh b/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh deleted file mode 100755 index 274cc5209c..0000000000 --- a/ChatQnA/tests/test_manifest_guardrails_on_gaudi.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -ROLLOUT_TIMEOUT_SECONDS="1800s" -KUBECTL_TIMEOUT_SECONDS="60s" - -function validate_chatqna() { - local ns=$1 - local log=$2 - max_retry=20 - # make sure microservice retriever-usvc is ready - # try to curl retriever-svc for max_retry times - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns) - curl http://$endpoint_url/v1/retrieval -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice retriever failed, exit with error." - return 1 - fi - # make sure microservice tgi-svc is ready - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns) - curl http://$endpoint_url/generate -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice tgi failed, exit with error." - return 1 - fi - - # check megaservice works - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$log.log - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns) - curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice failed, please check the logs in $LOGFILE!" - return ${exit_code} - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - if [[ $status == false ]]; then - echo "Response check failed, please check the logs in artifacts!" - return 1 - else - echo "Response check succeed!" 
- fi - return 0 -} - -function install_chatqna() { - echo "Testing manifests chatqna_guardrails" - local ns=$1 - bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns - pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest - kubectl create namespace $ns - # install guardrails - kubectl apply -f chatqna-guardrails.yaml -n $ns - # Sleep enough time for chatqna_guardrails to be ready - sleep 60 -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_ChatQnA) - pushd ChatQnA/tests/common - bash _test_manifest_utils.sh init_ChatQnA - popd - ;; - install_ChatQnA) - NAMESPACE=$2 - install_chatqna $NAMESPACE - popd - ;; - validate_ChatQnA) - NAMESPACE=$2 - SERVICE_NAME=chatqna-guardrails - validate_chatqna $NAMESPACE chatqna-guardrails - ret=$? - if [ $ret -ne 0 ]; then - exit $ret - fi - ;; - - *) - echo "Unknown function: $1" - ;; -esac diff --git a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh b/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh deleted file mode 100755 index 63d494c9f8..0000000000 --- a/ChatQnA/tests/test_manifest_guardrails_on_xeon.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -ROLLOUT_TIMEOUT_SECONDS="1800s" -KUBECTL_TIMEOUT_SECONDS="60s" - -function validate_chatqna() { - local ns=$1 - local log=$2 - max_retry=10 - # make sure microservice retriever-usvc is ready - # try to curl retriever-svc for max_retry times - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns) - curl http://$endpoint_url/v1/retrieval -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice retriever failed, exit with error." - return 1 - fi - # make sure microservice tgi-svc is ready - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns) - curl http://$endpoint_url/generate -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice tgi failed, exit with error." - return 1 - fi - - # check megaservice works - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$log.log - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns) - curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice failed, please check the logs in $LOGFILE!" - return ${exit_code} - fi - - echo "Checking response results, make sure the output is reasonable. 
" - local status=false - if [[ -f $LOGFILE ]] && - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - return 1 - else - echo "Response check succeed!" - fi - return 0 -} - -function install_chatqna() { - echo "Testing manifests chatqna_guardrails" - local ns=$1 - bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns - pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest - kubectl create namespace $ns - # install guardrail - kubectl apply -f chatqna-guardrails.yaml -n $ns - # Sleep enough time for chatqna_guardrails to be ready - sleep 60 -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_ChatQnA) - pushd ChatQnA/tests/common - bash _test_manifest_utils.sh init_ChatQnA - popd - ;; - install_ChatQnA) - NAMESPACE=$2 - install_chatqna $NAMESPACE - popd - ;; - validate_ChatQnA) - NAMESPACE=$2 - SERVICE_NAME=chatqna-guardrails - validate_chatqna $NAMESPACE chatqna-guardrails - ret=$? - if [ $ret -ne 0 ]; then - exit $ret - fi - ;; - - *) - echo "Unknown function: $1" - ;; -esac diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh deleted file mode 100755 index d1764401ff..0000000000 --- a/ChatQnA/tests/test_manifest_on_gaudi.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -ROLLOUT_TIMEOUT_SECONDS="1800s" -KUBECTL_TIMEOUT_SECONDS="60s" - -function install_chatqna { - echo "namespace is $NAMESPACE" - kubectl apply -f chatqna.yaml -n $NAMESPACE - # Sleep enough time for retreiver-usvc to be ready - sleep 60 -} - -function validate_chatqna() { - local ns=$1 - local log=$2 - max_retry=20 - # make sure microservice retriever-usvc is ready - # try to curl retriever-svc for max_retry times - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns) - curl http://$endpoint_url/v1/retrieval -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice retriever failed, exit with error." - return 1 - fi - # make sure microservice tgi-svc is ready - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns) - curl http://$endpoint_url/generate -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice tgi failed, exit with error." 
- return 1 - fi - - # check megaservice works - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$log.log - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns) - curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice failed, please check the logs in $LOGFILE!" - return ${exit_code} - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - return 1 - else - echo "Response check succeed!" - fi - return 0 -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_ChatQnA) - pushd ChatQnA/tests/common - bash _test_manifest_utils.sh init_ChatQnA - popd - ;; - install_ChatQnA) - pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest - NAMESPACE=$2 - install_chatqna - popd - ;; - validate_ChatQnA) - NAMESPACE=$2 - SERVICE_NAME=chatqna - validate_chatqna $NAMESPACE chatqna - ret=$? - if [ $ret -ne 0 ]; then - exit $ret - fi - ;; - - *) - echo "Unknown function: $1" - ;; -esac diff --git a/ChatQnA/tests/test_manifest_on_xeon.sh b/ChatQnA/tests/test_manifest_on_xeon.sh deleted file mode 100755 index 4c93a8958e..0000000000 --- a/ChatQnA/tests/test_manifest_on_xeon.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -ROLLOUT_TIMEOUT_SECONDS="1800s" -KUBECTL_TIMEOUT_SECONDS="60s" - -function install_chatqna { - echo "namespace is $NAMESPACE" - kubectl apply -f chatqna.yaml -n $NAMESPACE - # Sleep enough time for retreiver-usvc to be ready - sleep 60 -} - -function validate_chatqna() { - local ns=$1 - local log=$2 - max_retry=10 - # make sure microservice retriever-usvc is ready - # try to curl retriever-svc for max_retry times - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns) - curl http://$endpoint_url/v1/retrieval -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice retriever failed, exit with error." - return 1 - fi - # make sure microservice tgi-svc is ready - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns) - curl http://$endpoint_url/generate -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice tgi failed, exit with error." 
- return 1 - fi - - # check megaservice works - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$log.log - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns) - curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice failed, please check the logs in $LOGFILE!" - return ${exit_code} - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - return 1 - else - echo "Response check succeed!" - fi - return 0 -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_ChatQnA) - pushd ChatQnA/tests/common - bash _test_manifest_utils.sh init_ChatQnA - popd - ;; - install_ChatQnA) - pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest - NAMESPACE=$2 - install_chatqna - popd - ;; - validate_ChatQnA) - NAMESPACE=$2 - SERVICE_NAME=chatqna - validate_chatqna $NAMESPACE chatqna - ret=$? - if [ $ret -ne 0 ]; then - exit $ret - fi - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh b/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh deleted file mode 100755 index c1ab58460e..0000000000 --- a/ChatQnA/tests/test_manifest_vllm_on_gaudi.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -ROLLOUT_TIMEOUT_SECONDS="1800s" -KUBECTL_TIMEOUT_SECONDS="60s" - -function validate_chatqna() { - local ns=$1 - local log=$2 - max_retry=20 - # make sure microservice retriever-usvc is ready - # try to curl retriever-svc for max_retry times - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns) - curl http://$endpoint_url/v1/retrieval -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice retriever failed, exit with error." - return 1 - fi - - # make sure microservice vllm-svc is ready - for ((i=1; i<=max_retry; i++)) - do - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns) - curl http://$endpoint_url/v1/chat/completions -X POST \ - -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \ - -H 'Content-Type: application/json' && break - sleep 30 - done - # if i is bigger than max_retry, then exit with error - if [ $i -gt $max_retry ]; then - echo "Microservice vllm failed, exit with error." 
- return 1 - fi - - # check megaservice works - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$log.log - endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns) - curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice failed, please check the logs in $LOGFILE!" - return ${exit_code} - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - return 1 - else - echo "Response check succeed!" - fi - return 0 -} - -function install_chatqna() { - echo "Testing manifests chatqna_vllm" - local ns=$1 - bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns - kubectl create namespace $ns - # install guardrail - pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest - kubectl apply -f chatqna-vllm.yaml -n $ns - # Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes - sleep 280 -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_ChatQnA) - pushd ChatQnA/tests/common - bash _test_manifest_utils.sh init_ChatQnA - popd - ;; - install_ChatQnA) - NAMESPACE=$2 - install_chatqna $NAMESPACE - popd - ;; - validate_ChatQnA) - NAMESPACE=$2 - SERVICE_NAME=chatqna-vllm - validate_chatqna $NAMESPACE chatqna-vllm - ret=$? - if [ $ret -ne 0 ]; then - exit $ret - fi - ;; - - *) - echo "Unknown function: $1" - ;; -esac diff --git a/CodeGen/kubernetes/helm/gaudi-values.yaml b/CodeGen/kubernetes/helm/gaudi-values.yaml index e26bb4a5ed..25ac2c3959 100644 --- a/CodeGen/kubernetes/helm/gaudi-values.yaml +++ b/CodeGen/kubernetes/helm/gaudi-values.yaml @@ -6,13 +6,18 @@ tgi: LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/CodeGen/tests/test_manifest_on_gaudi.sh b/CodeGen/tests/test_manifest_on_gaudi.sh deleted file mode 100755 index a54e2d76df..0000000000 --- a/CodeGen/tests/test_manifest_on_gaudi.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_codegen() { - # executed under path manifest/codegen/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . 
-name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_codegen { - echo "namespace is $NAMESPACE" - kubectl apply -f codegen.yaml -n $NAMESPACE -} - -function validate_codegen() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/codegen..." - - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \ - -d '{"messages": "def print_hello_world():"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice codegen failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "print" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_CodeGen) - pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest - init_codegen - popd - ;; - install_CodeGen) - pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest - NAMESPACE=$2 - install_codegen - popd - ;; - validate_CodeGen) - NAMESPACE=$2 - SERVICE_NAME=codegen - validate_codegen - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/CodeGen/tests/test_manifest_on_xeon.sh b/CodeGen/tests/test_manifest_on_xeon.sh deleted file mode 100755 index b0975f14a6..0000000000 --- a/CodeGen/tests/test_manifest_on_xeon.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_codegen() { - # executed under path manifest/codegen/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_codegen { - echo "namespace is $NAMESPACE" - kubectl apply -f codegen.yaml -n $NAMESPACE -} - -function validate_codegen() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/codegen..." 
- - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \ - -d '{"messages": "def print_hello_world():"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice codegen failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "print" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_CodeGen) - pushd CodeGen/kubernetes/intel/cpu/xeon/manifest - init_codegen - popd - ;; - install_CodeGen) - pushd CodeGen/kubernetes/intel/cpu/xeon/manifest - NAMESPACE=$2 - install_codegen - popd - ;; - validate_CodeGen) - NAMESPACE=$2 - SERVICE_NAME=codegen - validate_codegen - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/CodeTrans/kubernetes/helm/gaudi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-values.yaml index e5367383ae..89ed259285 100644 --- a/CodeTrans/kubernetes/helm/gaudi-values.yaml +++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml @@ -5,13 +5,18 @@ tgi: accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/CodeTrans/tests/test_manifest_on_gaudi.sh b/CodeTrans/tests/test_manifest_on_gaudi.sh deleted file mode 100755 index 7be05ae33b..0000000000 --- a/CodeTrans/tests/test_manifest_on_gaudi.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_codetrans() { - # executed under path manifest/codetrans/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_codetrans { - echo "namespace is $NAMESPACE" - kubectl apply -f codetrans.yaml -n $NAMESPACE -} - -function validate_codetrans() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/codetrans..." 
- - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/codetrans \ - -H 'Content-Type: application/json' \ - -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice codetrans failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "print" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_CodeTrans) - pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest - init_codetrans - popd - ;; - install_CodeTrans) - pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest - NAMESPACE=$2 - install_codetrans - popd - ;; - validate_CodeTrans) - NAMESPACE=$2 - SERVICE_NAME=codetrans - validate_codetrans - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/CodeTrans/tests/test_manifest_on_xeon.sh b/CodeTrans/tests/test_manifest_on_xeon.sh deleted file mode 100755 index 7f2b969240..0000000000 --- a/CodeTrans/tests/test_manifest_on_xeon.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_codetrans() { - # executed under path manifest/codetrans/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_codetrans { - echo "namespace is $NAMESPACE" - kubectl apply -f codetrans.yaml -n $NAMESPACE -} - -function validate_codetrans() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/codetrans..." - - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/codetrans \ - -H 'Content-Type: application/json' \ - -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice codetrans failed, please check the logs in $LOGFILE!" 
- exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "print" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_CodeTrans) - pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest - init_codetrans - popd - ;; - install_CodeTrans) - pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest - NAMESPACE=$2 - install_codetrans - popd - ;; - validate_CodeTrans) - NAMESPACE=$2 - SERVICE_NAME=codetrans - validate_codetrans - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/DocSum/kubernetes/helm/cpu-values.yaml b/DocSum/kubernetes/helm/cpu-values.yaml index 97818ae448..6f2ab7768f 100644 --- a/DocSum/kubernetes/helm/cpu-values.yaml +++ b/DocSum/kubernetes/helm/cpu-values.yaml @@ -2,4 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 tgi: - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + enabled: true +vllm: + enabled: false diff --git a/DocSum/kubernetes/helm/gaudi-values.yaml b/DocSum/kubernetes/helm/gaudi-values.yaml index 5cfae25928..eda0abe8c4 100644 --- a/DocSum/kubernetes/helm/gaudi-values.yaml +++ b/DocSum/kubernetes/helm/gaudi-values.yaml @@ -1,16 +1,21 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +vllm: + enabled: false + +llm-uservice: + DOCSUM_BACKEND: "TGI" + tgi: + enabled: true accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" ENABLE_HPU_GRAPH: true LIMIT_HPU_GRAPH: true diff --git a/DocSum/tests/test_manifest_on_gaudi.sh b/DocSum/tests/test_manifest_on_gaudi.sh deleted file mode 100755 index db731ac4a9..0000000000 --- a/DocSum/tests/test_manifest_on_gaudi.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_docsum() { - # executed under path manifest/docsum/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_docsum { - echo "namespace is $NAMESPACE" - kubectl apply -f docsum.yaml -n $NAMESPACE -} - -function validate_docsum() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/docsum..." 
- - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/docsum \ - -H 'Content-Type: multipart/form-data' \ - -F 'type=text' \ - -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice docsum failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_DocSum) - pushd DocSum/kubernetes/intel/hpu/gaudi/manifest - init_docsum - popd - ;; - install_DocSum) - pushd DocSum/kubernetes/intel/hpu/gaudi/manifest - NAMESPACE=$2 - install_docsum - popd - ;; - validate_DocSum) - NAMESPACE=$2 - SERVICE_NAME=docsum - validate_docsum - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/DocSum/tests/test_manifest_on_xeon.sh b/DocSum/tests/test_manifest_on_xeon.sh deleted file mode 100755 index 0bf613975c..0000000000 --- a/DocSum/tests/test_manifest_on_xeon.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_docsum() { - # executed under path manifest/docsum/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_docsum { - echo "namespace is $NAMESPACE" - kubectl apply -f docsum.yaml -n $NAMESPACE -} - -function validate_docsum() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/docsum..." - - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/docsum \ - -H 'Content-Type: multipart/form-data' \ - -F 'type=text' \ - -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." 
> $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice docsum failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - exit 1 - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_DocSum) - pushd DocSum/kubernetes/intel/cpu/xeon/manifest - init_docsum - popd - ;; - install_DocSum) - pushd DocSum/kubernetes/intel/cpu/xeon/manifest - NAMESPACE=$2 - install_docsum - popd - ;; - validate_DocSum) - NAMESPACE=$2 - SERVICE_NAME=docsum - validate_docsum - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/FaqGen/kubernetes/helm/gaudi-values.yaml b/FaqGen/kubernetes/helm/gaudi-values.yaml index d14729c4a3..e45cde146f 100644 --- a/FaqGen/kubernetes/helm/gaudi-values.yaml +++ b/FaqGen/kubernetes/helm/gaudi-values.yaml @@ -5,13 +5,25 @@ tgi: accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "4096" - MAX_TOTAL_TOKENS: "8192" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "0" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + PREFILL_BATCH_BUCKET_SIZE: 1 + BATCH_BUCKET_SIZE: 8 + extraCmdArgs: + - "--max-batch-total-tokens" + - "65536" + - "--max-batch-prefill-tokens" + - "4096" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md index 8e46be1c8a..bda42ee285 100644 --- a/MultimodalQnA/README.md +++ b/MultimodalQnA/README.md @@ -87,12 +87,12 @@ In the below, we provide a table that describes for each microservice component
Gaudi default compose.yaml -| MicroService | Open Source Project | HW | Port | Endpoint | -| ------------ | --------------------- | ----- | ---- | --------------------------------------------------------------------- | -| Embedding | Langchain | Xeon | 6000 | /v1/embeddings | -| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval | -| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm | -| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest_with_text | +| MicroService | Open Source Project | HW | Port | Endpoint | +| ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- | +| Embedding | Langchain | Xeon | 6000 | /v1/embeddings | +| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval | +| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm | +| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
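
The table above lists the service ports for the Gaudi compose deployment; the dataprep get/delete endpoints touched further down in this change accept a JSON body with `file_path` set to `"all"`. As a quick post-deployment smoke test, a minimal sketch follows, assuming the platform's `set_env.sh` has already been sourced so the `DATAPREP_*` endpoint variables are defined:

```bash
# Sketch: list every file currently indexed by the dataprep service.
# Assumes set_env.sh has been sourced, so DATAPREP_GET_FILE_ENDPOINT points at
# http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get (port 6007 after this change).
curl -X POST \
  -H "Content-Type: application/json" \
  -d '{"file_path": "all"}' \
  ${DATAPREP_GET_FILE_ENDPOINT}
```
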
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md index e49b264823..f49b9815f1 100644 --- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md +++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md @@ -289,6 +289,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS ```bash curl -X POST \ -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh index c271a4b553..5cb482bc55 100644 --- a/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh +++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/set_env.sh @@ -26,8 +26,8 @@ export MM_RETRIEVER_SERVICE_HOST_IP=${HOST_IP} export LVM_SERVICE_HOST_IP=${HOST_IP} export MEGA_SERVICE_HOST_IP=${HOST_IP} export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna" -export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest" -export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts" -export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions" -export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get" -export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete" +export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest" +export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts" +export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions" +export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get" +export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete" diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md index 714fc72661..7e4fa6894a 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md @@ -92,7 +92,7 @@ export REDIS_INSIGHTS_PORT=8001 export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" -export DATAPREP_MMR_PORT=5000 +export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" @@ -334,15 +334,6 @@ export audio_fn="AudioSample.wav" wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn} ``` -```bash -export DATAPREP_MMR_PORT=6007 -export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" -export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" -export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" -export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get" -export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete" -``` - Test dataprep 
microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file. ```bash @@ -398,6 +389,7 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS ```bash curl -X POST \ -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml index 681ba25ee6..31f543c755 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -26,7 +26,7 @@ services: - redis-vector-db - lvm-llava ports: - - "6007:${DATAPREP_MMR_PORT}" + - "${DATAPREP_MMR_PORT}:5000" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh index 593e89452c..057f90990c 100755 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh @@ -28,7 +28,7 @@ export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" -export DATAPREP_MMR_PORT=5000 +export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md index 35ed4abbc8..2379fc3d4d 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md @@ -37,7 +37,7 @@ export WHISPER_PORT=7066 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" export MAX_IMAGES=1 export WHISPER_MODEL="base" -export DATAPREP_MMR_PORT=5000 +export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" @@ -282,15 +282,6 @@ wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_ex Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file. 
-```bash -export DATAPREP_MMR_PORT=6007 -export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" -export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" -export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" -export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get" -export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete" -``` - ```bash curl --silent --write-out "HTTPSTATUS:%{http_code}" \ ${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} \ @@ -324,6 +315,7 @@ Also, you are able to get the list of all files that you uploaded: ```bash curl -X POST \ -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ ${DATAPREP_GET_FILE_ENDPOINT} ``` diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 6c646674b7..26b5610f5e 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -28,7 +28,7 @@ services: - redis-vector-db - lvm ports: - - "6007:${DATAPREP_MMR_PORT}" + - "${DATAPREP_MMR_PORT}:5000" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} @@ -42,16 +42,21 @@ services: MULTIMODAL_DATAPREP: true DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MULTIMODALREDIS" restart: unless-stopped - embedding-multimodal-bridgetower: - image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest} - container_name: embedding-multimodal-bridgetower + embedding-multimodal-bridgetower-gaudi: + image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest} + container_name: embedding-multimodal-bridgetower-gaudi ports: - ${EMM_BRIDGETOWER_PORT}:${EMM_BRIDGETOWER_PORT} + ipc: host environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} PORT: ${EMM_BRIDGETOWER_PORT} + HABANA_VISIBLE_DEVICES: all + runtime: habana + cap_add: + - SYS_NICE healthcheck: test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMM_BRIDGETOWER_PORT}/v1/health_check"] interval: 10s @@ -64,7 +69,7 @@ services: image: ${REGISTRY:-opea}/embedding:${TAG:-latest} container_name: embedding depends_on: - embedding-multimodal-bridgetower: + embedding-multimodal-bridgetower-gaudi: condition: service_healthy ports: - ${MM_EMBEDDING_PORT_MICROSERVICE}:${MM_EMBEDDING_PORT_MICROSERVICE} diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh index a92483f9a0..cc35d58d08 100755 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -28,7 +28,7 @@ export WHISPER_PORT=7066 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" export MAX_IMAGES=1 -export DATAPREP_MMR_PORT=5000 +export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml index e90d0cc686..1fc599c3e5 100644 --- 
a/MultimodalQnA/docker_image_build/build.yaml +++ b/MultimodalQnA/docker_image_build/build.yaml @@ -23,6 +23,12 @@ services: dockerfile: comps/third_parties/bridgetower/src/Dockerfile extends: multimodalqna image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest} + embedding-multimodal-bridgetower-gaudi: + build: + context: GenAIComps + dockerfile: comps/third_parties/bridgetower/src/Dockerfile.intel_hpu + extends: multimodalqna + image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower-gaudi:${TAG:-latest} embedding: build: context: GenAIComps diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh index d50e024dc2..ccb4f1894d 100644 --- a/MultimodalQnA/tests/test_compose_on_gaudi.sh +++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh @@ -59,7 +59,7 @@ function build_docker_images() { git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm dataprep whisper" + service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever lvm dataprep whisper" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 @@ -82,7 +82,7 @@ function setup_env() { export MAX_IMAGES=1 export WHISPER_MODEL="base" export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" - export DATAPREP_MMR_PORT=5000 + export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" @@ -177,19 +177,19 @@ function validate_microservices() { # Check if the microservices are running correctly. 
# Bridgetower Embedding Server - echo "Validating embedding-multimodal-bridgetower" + echo "Validating embedding-multimodal-bridgetower-gaudi" validate_service \ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \ '"embedding":[' \ - "embedding-multimodal-bridgetower" \ - "embedding-multimodal-bridgetower" \ + "embedding-multimodal-bridgetower-gaudi" \ + "embedding-multimodal-bridgetower-gaudi" \ '{"text":"This is example"}' validate_service \ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \ '"embedding":[' \ - "embedding-multimodal-bridgetower" \ - "embedding-multimodal-bridgetower" \ + "embedding-multimodal-bridgetower-gaudi" \ + "embedding-multimodal-bridgetower-gaudi" \ '{"text":"This is example", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}' # embedding microservice @@ -210,11 +210,6 @@ function validate_microservices() { sleep 1m # retrieval can't curl as expected, try to wait for more time - export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest" - export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts" - export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions" - export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get" - # test data prep echo "Validating Data Prep with Generating Transcript for Video" validate_service \ diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh index 7787d919ee..9ba5c68c90 100644 --- a/MultimodalQnA/tests/test_compose_on_rocm.sh +++ b/MultimodalQnA/tests/test_compose_on_rocm.sh @@ -67,11 +67,11 @@ function setup_env() { export LVM_SERVICE_HOST_IP=${HOST_IP} export MEGA_SERVICE_HOST_IP=${HOST_IP} export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:8888/v1/multimodalqna" - export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/ingest" - export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_transcripts" - export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/generate_captions" - export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/get" - export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:5000/v1/dataprep/delete" + export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest" + export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts" + export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions" + export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get" + export DATAPREP_DELETE_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/delete" } function start_services() { @@ -174,11 +174,6 @@ function validate_microservices() { sleep 1m # retrieval can't curl as expected, try to wait for more time - export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/ingest" - export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_transcripts" - export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/generate_captions" - export DATAPREP_GET_FILE_ENDPOINT="http://${HOST_IP}:6007/v1/dataprep/get" - # test data prep echo "Data Prep with Generating Transcript for Video" validate_service \ diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh index 
10b1579478..b5d254b58c 100644 --- a/MultimodalQnA/tests/test_compose_on_xeon.sh +++ b/MultimodalQnA/tests/test_compose_on_xeon.sh @@ -79,7 +79,7 @@ function setup_env() { export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" - export DATAPREP_MMR_PORT=5000 + export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_captions" @@ -207,11 +207,6 @@ function validate_microservices() { sleep 1m # retrieval can't curl as expected, try to wait for more time - export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest" - export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_transcripts" - export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/generate_captions" - export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get" - # test data prep echo "Validating Data Prep with Generating Transcript for Video" validate_service \ diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py index 6b94e54be9..7919ce5910 100644 --- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py +++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py @@ -610,7 +610,7 @@ def select_upload_type(choice, request: gr.Request): "BACKEND_SERVICE_ENDPOINT", f"http://localhost:{MEGA_SERVICE_PORT}/v1/multimodalqna" ) dataprep_ingest_endpoint = os.getenv( - "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest_with_text" + "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest" ) dataprep_gen_transcript_endpoint = os.getenv( "DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_transcripts" diff --git a/SearchQnA/kubernetes/helm/README.md b/SearchQnA/kubernetes/helm/README.md new file mode 100644 index 0000000000..ccdf71a32f --- /dev/null +++ b/SearchQnA/kubernetes/helm/README.md @@ -0,0 +1,18 @@ +# Deploy SearchQnA on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/SearchQnA/kubernetes/helm/cpu-values.yaml b/SearchQnA/kubernetes/helm/cpu-values.yaml new file mode 100644 index 0000000000..4de7affb83 --- /dev/null +++ b/SearchQnA/kubernetes/helm/cpu-values.yaml @@ -0,0 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 +llm_uservice: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml new file mode 100644 index 0000000000..ef327645de --- /dev/null +++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "2048" + MAX_TOTAL_TOKENS: "4096" + CUDA_GRAPHS: "" + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + +tei: + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tei-gaudi + tag: "1.5.0" + OMPI_MCA_btl_vader_single_copy_mechanism: none + MAX_WARMUP_SEQUENCE_LENGTH: 512 + securityContext: + readOnlyRootFilesystem: false + resources: + limits: + habana.ai/gaudi: 1 + livenessProbe: + timeoutSeconds: 1 + readinessProbe: + timeoutSeconds: 1 diff --git a/Text2Image/kubernetes/helm/README.md b/Text2Image/kubernetes/helm/README.md new file mode 100644 index 0000000000..6d26e77bd9 --- /dev/null +++ b/Text2Image/kubernetes/helm/README.md @@ -0,0 +1,18 @@ +# Deploy txt2img on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). 
+ +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/Text2Image/kubernetes/helm/cpu-values.yaml b/Text2Image/kubernetes/helm/cpu-values.yaml new file mode 100644 index 0000000000..87a6085784 --- /dev/null +++ b/Text2Image/kubernetes/helm/cpu-values.yaml @@ -0,0 +1,6 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +text2image: + image: + repository: opea/text2image diff --git a/Text2Image/kubernetes/helm/gaudi-values.yaml b/Text2Image/kubernetes/helm/gaudi-values.yaml new file mode 100644 index 0000000000..f43d405d5a --- /dev/null +++ b/Text2Image/kubernetes/helm/gaudi-values.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +text2image: + accelDevice: "gaudi" + image: + repository: opea/text2image-gaudi + resources: + limits: + habana.ai/gaudi: 1 + # The following hugepage related settings is for default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5 + # User should change the resource limits for other models + hugepages-2Mi: 256Mi + volumes: + - name: hugepage-2mi + emptyDir: + medium: HugePages-2Mi + volumeMounts: + - name: hugepage-2mi + mountPath: /hugepages-2Mi + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 diff --git a/Translation/tests/test_manifest_on_gaudi.sh b/Translation/tests/test_manifest_on_gaudi.sh deleted file mode 100755 index ea1f113cd7..0000000000 --- a/Translation/tests/test_manifest_on_gaudi.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_translation() { - # executed under path manifest/translation/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_translation { - echo "namespace is $NAMESPACE" - kubectl apply -f translation.yaml -n $NAMESPACE - sleep 50s -} - -function validate_translation() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/translation..." 
- - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/translation \ - -H 'Content-Type: application/json' \ - -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice translation failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. " - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_Translation) - pushd Translation/kubernetes/intel/hpu/gaudi/manifest - init_translation - popd - ;; - install_Translation) - pushd Translation/kubernetes/intel/hpu/gaudi/manifest - NAMESPACE=$2 - install_translation - popd - ;; - validate_Translation) - NAMESPACE=$2 - SERVICE_NAME=translation - validate_translation - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/Translation/tests/test_manifest_on_xeon.sh b/Translation/tests/test_manifest_on_xeon.sh deleted file mode 100755 index d32eb4a229..0000000000 --- a/Translation/tests/test_manifest_on_xeon.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -xe -USER_ID=$(whoami) -LOG_PATH=/home/$(whoami)/logs -MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub -IMAGE_REPO=${IMAGE_REPO:-opea} -IMAGE_TAG=${IMAGE_TAG:-latest} - -function init_translation() { - # executed under path manifest/translation/xeon - # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT" - find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \; - # replace microservice image tag - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \; - # replace the repository "image: opea/*" with "image: $IMAGE_REPO/" - find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \; - # set huggingface token - find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \; -} - -function install_translation { - echo "namespace is $NAMESPACE" - kubectl apply -f translation.yaml -n $NAMESPACE -} - -function validate_translation() { - ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}') - port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}') - echo "try to curl http://${ip_address}:${port}/v1/translation..." - - # generate a random logfile name to avoid conflict among multiple runners - LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log - # Curl the Mega Service - curl http://${ip_address}:${port}/v1/translation \ - -H 'Content-Type: application/json' \ - -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE - exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Megaservice translation failed, please check the logs in $LOGFILE!" - exit 1 - fi - - echo "Checking response results, make sure the output is reasonable. 
" - local status=false - if [[ -f $LOGFILE ]] && \ - [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then - status=true - fi - - if [ $status == false ]; then - echo "Response check failed, please check the logs in artifacts!" - else - echo "Response check succeed!" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 " - exit 1 -fi - -case "$1" in - init_Translation) - pushd Translation/kubernetes/intel/cpu/xeon/manifest - init_translation - popd - ;; - install_Translation) - pushd Translation/kubernetes/intel/cpu/xeon/manifest - NAMESPACE=$2 - install_translation - popd - ;; - validate_Translation) - NAMESPACE=$2 - SERVICE_NAME=translation - validate_translation - ;; - *) - echo "Unknown function: $1" - ;; -esac diff --git a/VisualQnA/kubernetes/helm/gaudi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-values.yaml index 5a0e95c3a9..eb6494a142 100644 --- a/VisualQnA/kubernetes/helm/gaudi-values.yaml +++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml @@ -9,13 +9,18 @@ tgi: accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 MAX_INPUT_LENGTH: "4096" MAX_TOTAL_TOKENS: "8192" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5