diff --git a/MultimodalQnA/Dockerfile b/MultimodalQnA/Dockerfile
index f38b305e42..ca73e5486a 100644
--- a/MultimodalQnA/Dockerfile
+++ b/MultimodalQnA/Dockerfile
@@ -16,13 +16,12 @@ RUN useradd -m -s /bin/bash user && \
WORKDIR $HOME
-
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
-
+# NOTE(review): clones a fork branch instead of upstream https://github.com/opea-project/GenAIComps.git - presumably temporary until the image-query changes land upstream; confirm and restore before release.
+RUN git clone --depth 1 https://github.com/mhbuehler/GenAIComps.git --single-branch --branch mmqna-image-query
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md
index c96e4229bf..8e46be1c8a 100644
--- a/MultimodalQnA/README.md
+++ b/MultimodalQnA/README.md
@@ -1,8 +1,8 @@
# MultimodalQnA Application
-Suppose you possess a set of videos and wish to perform question-answering to extract insights from these videos. To respond to your questions, it typically necessitates comprehension of visual cues within the videos, knowledge derived from the audio content, or often a mix of both these visual elements and auditory facts. The MultimodalQnA framework offers an optimal solution for this purpose.
+Suppose you possess a set of videos, images, audio files, PDFs, or some combination thereof and wish to perform question-answering to extract insights from these documents. To respond to your questions, the system needs to comprehend a mix of textual, visual, and audio facts drawn from the document contents. The MultimodalQnA framework offers an optimal solution for this purpose.
-`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (frames, transcripts, and/or captions) from your collection of videos, images, and audio files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user.
+`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (e.g. images, transcripts, and captions) from your collection of video, image, audio, and PDF files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user.
The MultimodalQnA architecture shows below:
@@ -87,12 +87,12 @@ In the below, we provide a table that describes for each microservice component
Gaudi default compose.yaml
-| MicroService | Open Source Project | HW | Port | Endpoint |
-| ------------ | --------------------- | ----- | ---- | ----------------------------------------------- |
-| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
-| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
-| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
-| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions |
+| MicroService | Open Source Project | HW | Port | Endpoint |
+| ------------ | --------------------- | ----- | ---- | --------------------------------------------------------------------- |
+| Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
+| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval |
+| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm |
+| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest_with_text |
@@ -172,8 +172,38 @@ docker compose -f compose.yaml up -d
## MultimodalQnA Demo on Gaudi2
-![MultimodalQnA-upload-waiting-screenshot](./assets/img/upload-gen-trans.png)
+### Multimodal QnA UI
-![MultimodalQnA-upload-done-screenshot](./assets/img/upload-gen-captions.png)
+![MultimodalQnA-ui-screenshot](./assets/img/mmqna-ui.png)
-![MultimodalQnA-query-example-screenshot](./assets/img/example_query.png)
+### Video Ingestion
+
+![MultimodalQnA-ingest-video-screenshot](./assets/img/video-ingestion.png)
+
+### Text Query following the ingestion of a Video
+
+![MultimodalQnA-video-query-screenshot](./assets/img/video-query.png)
+
+### Image Ingestion
+
+![MultimodalQnA-ingest-image-screenshot](./assets/img/image-ingestion.png)
+
+### Text Query following the ingestion of an Image
+
+![MultimodalQnA-image-query-screenshot](./assets/img/image-query.png)
+
+### Audio Ingestion
+
+![MultimodalQnA-audio-ingestion-screenshot](./assets/img/audio-ingestion.png)
+
+### Text Query following the ingestion of an Audio Podcast
+
+![MultimodalQnA-audio-query-screenshot](./assets/img/audio-query.png)
+
+### PDF Ingestion
+
+![MultimodalQnA-upload-pdf-screenshot](./assets/img/ingest_pdf.png)
+
+### Text Query following the ingestion of a PDF
+
+![MultimodalQnA-pdf-query-example-screenshot](./assets/img/pdf-query.png)
diff --git a/MultimodalQnA/assets/img/audio-ingestion.png b/MultimodalQnA/assets/img/audio-ingestion.png
new file mode 100644
index 0000000000..56ad7ca276
Binary files /dev/null and b/MultimodalQnA/assets/img/audio-ingestion.png differ
diff --git a/MultimodalQnA/assets/img/audio-query.png b/MultimodalQnA/assets/img/audio-query.png
new file mode 100644
index 0000000000..028a8324e9
Binary files /dev/null and b/MultimodalQnA/assets/img/audio-query.png differ
diff --git a/MultimodalQnA/assets/img/image-ingestion.png b/MultimodalQnA/assets/img/image-ingestion.png
new file mode 100644
index 0000000000..544a69ddd2
Binary files /dev/null and b/MultimodalQnA/assets/img/image-ingestion.png differ
diff --git a/MultimodalQnA/assets/img/image-query.png b/MultimodalQnA/assets/img/image-query.png
new file mode 100644
index 0000000000..2c075f6cf4
Binary files /dev/null and b/MultimodalQnA/assets/img/image-query.png differ
diff --git a/MultimodalQnA/assets/img/ingest_pdf.png b/MultimodalQnA/assets/img/ingest_pdf.png
new file mode 100644
index 0000000000..ae68a263c8
Binary files /dev/null and b/MultimodalQnA/assets/img/ingest_pdf.png differ
diff --git a/MultimodalQnA/assets/img/mmqna-ui.png b/MultimodalQnA/assets/img/mmqna-ui.png
new file mode 100644
index 0000000000..105720ee49
Binary files /dev/null and b/MultimodalQnA/assets/img/mmqna-ui.png differ
diff --git a/MultimodalQnA/assets/img/pdf-query.png b/MultimodalQnA/assets/img/pdf-query.png
new file mode 100644
index 0000000000..c4968f112a
Binary files /dev/null and b/MultimodalQnA/assets/img/pdf-query.png differ
diff --git a/MultimodalQnA/assets/img/video-ingestion.png b/MultimodalQnA/assets/img/video-ingestion.png
new file mode 100644
index 0000000000..fa39e48b1b
Binary files /dev/null and b/MultimodalQnA/assets/img/video-ingestion.png differ
diff --git a/MultimodalQnA/assets/img/video-query.png b/MultimodalQnA/assets/img/video-query.png
new file mode 100644
index 0000000000..53be0051f6
Binary files /dev/null and b/MultimodalQnA/assets/img/video-query.png differ
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
index 5a72491c32..86710fc7a2 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
@@ -40,6 +40,10 @@ lvm
===
Port 9399 - Open to 0.0.0.0/0
+whisper
+===
+Port 7066 - Open to 0.0.0.0/0
+
dataprep-multimodal-redis
===
Port 6007 - Open to 0.0.0.0/0
@@ -75,34 +79,47 @@ export your_no_proxy=${your_no_proxy},"External_Public_IP"
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
-export EMBEDDER_PORT=6006
-export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
-export MM_EMBEDDING_PORT_MICROSERVICE=6000
-export WHISPER_SERVER_PORT=7066
-export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"
-export REDIS_URL="redis://${host_ip}:6379"
+export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
+export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
+export LVM_SERVICE_HOST_IP=${host_ip}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export WHISPER_PORT=7066
+export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+export WHISPER_MODEL="base"
+export MAX_IMAGES=1
+export REDIS_DB_PORT=6379
+export REDIS_INSIGHTS_PORT=8001
+export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
+export DATAPREP_MMR_PORT=6007
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+export EMM_BRIDGETOWER_PORT=6006
+export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
export BRIDGE_TOWER_EMBEDDING=true
+export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+export MM_EMBEDDING_PORT_MICROSERVICE=6000
+export REDIS_RETRIEVER_PORT=7000
+export LVM_PORT=9399
export LLAVA_SERVER_PORT=8399
-export LVM_ENDPOINT="http://${host_ip}:8399"
-export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
export LVM_MODEL_ID="llava-hf/llava-1.5-7b-hf"
-export WHISPER_MODEL="base"
-export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
-export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
-export LVM_SERVICE_HOST_IP=${host_ip}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+export LVM_ENDPOINT="http://${host_ip}:$LLAVA_SERVER_PORT"
+export MEGA_SERVICE_PORT=8888
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:$MEGA_SERVICE_PORT/v1/multimodalqna"
+export UI_PORT=5173
```
Note: Please replace with `host_ip` with you external IP address, do not use localhost.
+> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
+> If an image list longer than `MAX_IMAGES` is sent to the LVM service, a shortened image list will be sent to the LLaVA server. If the image list
+> needs to be shortened, the most recent images (the ones at the end of the list) are prioritized. Some LLaVA models have not
+> been trained with multiple images, so sending more than one may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`.
+
## 🚀 Build Docker Images
### 1. Build embedding-multimodal-bridgetower Image
@@ -112,7 +129,7 @@ Build embedding-multimodal-bridgetower docker image
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
-docker build --no-cache -t opea/embedding-multimodal-bridgetower:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/bridgetower/src/Dockerfile .
+docker build --no-cache -t opea/embedding-multimodal-bridgetower:latest --build-arg EMBEDDER_PORT=$EMM_BRIDGETOWER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/bridgetower/src/Dockerfile .
```
Build embedding microservice image
@@ -147,7 +164,7 @@ docker build --no-cache -t opea/lvm:latest --build-arg https_proxy=$https_proxy
docker build --no-cache -t opea/dataprep-multimodal-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimodal/redis/langchain/Dockerfile .
```
-### 5. Build asr images
+### 5. Build Whisper Server Image
Build whisper server image
@@ -214,14 +231,14 @@ docker compose -f compose.yaml up -d
1. embedding-multimodal-bridgetower
```bash
-curl http://${host_ip}:${EMBEDDER_PORT}/v1/encode \
+curl http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode \
-X POST \
-H "Content-Type:application/json" \
-d '{"text":"This is example"}'
```
```bash
-curl http://${host_ip}:${EMBEDDER_PORT}/v1/encode \
+curl http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode \
-X POST \
-H "Content-Type:application/json" \
-d '{"text":"This is example", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
@@ -247,13 +264,13 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
```bash
export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:7000/v1/multimodal_retrieval \
+curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/multimodal_retrieval \
-X POST \
-H "Content-Type: application/json" \
-d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
```
-4. asr
+4. whisper
```bash
curl ${WHISPER_SERVER_ENDPOINT} \
@@ -274,14 +291,14 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \
6. lvm
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
```
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}'
@@ -290,7 +307,7 @@ curl http://${host_ip}:9399/v1/lvm \
Also, validate LVM Microservice with empty retrieval results
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
@@ -298,7 +315,7 @@ curl http://${host_ip}:9399/v1/lvm \
7. dataprep-multimodal-redis
-Download a sample video, image, and audio file and create a caption
+Download a sample video, image, PDF, and audio file and create a caption
```bash
export video_fn="WeAreGoingOnBullrun.mp4"
@@ -307,6 +324,9 @@ wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoing
export image_fn="apple.png"
wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
+export pdf_fn="nke-10k-2023.pdf"
+wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
+
export caption_fn="apple.txt"
echo "This is an apple." > ${caption_fn}
@@ -325,7 +345,7 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \
-F "files=@./${audio_fn}"
```
-Also, test dataprep microservice with generating an image caption using lvm microservice
+Also, test dataprep microservice with generating an image caption using lvm microservice.
```bash
curl --silent --write-out "HTTPSTATUS:%{http_code}" \
@@ -334,13 +354,14 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \
-X POST -F "files=@./${image_fn}"
```
-Now, test the microservice with posting a custom caption along with an image
+Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text.
```bash
curl --silent --write-out "HTTPSTATUS:%{http_code}" \
${DATAPREP_INGEST_SERVICE_ENDPOINT} \
-H 'Content-Type: multipart/form-data' \
- -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}"
+ -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" \
+ -F "files=@./${pdf_fn}"
```
Also, you are able to get the list of all files that you uploaded:
@@ -358,7 +379,8 @@ Then you will get the response python-style LIST like this. Notice the name of e
"WeAreGoingOnBullrun_7ac553a1-116c-40a2-9fc5-deccbb89b507.mp4",
"WeAreGoingOnBullrun_6d13cf26-8ba2-4026-a3a9-ab2e5eb73a29.mp4",
"apple_fcade6e6-11a5-44a2-833a-3e534cbe4419.png",
- "AudioSample_976a85a6-dc3e-43ab-966c-9d81beef780c.wav
+ "nke-10k-2023_28000757-5533-4b1b-89fe-7c0a1b7e2cd0.pdf",
+ "AudioSample_976a85a6-dc3e-43ab-966c-9d81beef780c.wav"
]
```
@@ -372,21 +394,41 @@ curl -X POST \
8. MegaService
+Test the MegaService with a text query:
+
```bash
-curl http://${host_ip}:8888/v1/multimodalqna \
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
-H "Content-Type: application/json" \
-X POST \
-d '{"messages": "What is the revenue of Nike in 2023?"}'
```
+Test the MegaService with an audio query:
+
+```bash
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
+ -H "Content-Type: application/json" \
+ -d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
+```
+
+Test the MegaService with a text and image query:
+
+```bash
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
+ -H "Content-Type: application/json" \
+ -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Green bananas in a tree"}, {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/test-stuff2017/000000004248.jpg"}}]}]}'
+```
+
+Test the MegaService with a back and forth conversation between the user and assistant:
+
```bash
-curl http://${host_ip}:8888/v1/multimodalqna \
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
```
```bash
-curl http://${host_ip}:8888/v1/multimodalqna \
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}'
```
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 48c40f3bb3..283251946c 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- - "7066:7066"
+ - "${WHISPER_PORT}:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -17,8 +17,8 @@ services:
image: redis/redis-stack:7.2.0-v9
container_name: redis-vector-db
ports:
- - "6379:6379"
- - "8001:8001"
+ - "${REDIS_DB_PORT}:6379" # host port is configurable; Redis inside redis-stack listens on 6379
+ - "${REDIS_INSIGHTS_PORT}:8001" # host port is configurable; RedisInsight inside redis-stack listens on 8001
dataprep-multimodal-redis:
image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest}
container_name: dataprep-multimodal-redis
@@ -26,29 +26,31 @@ services:
- redis-vector-db
- lvm-llava
ports:
- - "6007:6007"
+ - "${DATAPREP_MMR_PORT}:${DATAPREP_MMR_PORT}"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${REDIS_URL}
REDIS_HOST: ${REDIS_HOST}
+ DATAPREP_MMR_PORT: ${DATAPREP_MMR_PORT}
INDEX_NAME: ${INDEX_NAME}
- LVM_ENDPOINT: "http://${LVM_SERVICE_HOST_IP}:9399/v1/lvm"
+ LVM_ENDPOINT: "http://${LVM_SERVICE_HOST_IP}:${LVM_PORT}/v1/lvm"
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
embedding-multimodal-bridgetower:
image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
container_name: embedding-multimodal-bridgetower
ports:
- - ${EMBEDDER_PORT}:${EMBEDDER_PORT}
+ - ${EMM_BRIDGETOWER_PORT}:${EMM_BRIDGETOWER_PORT}
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- PORT: ${EMBEDDER_PORT}
+ EMM_BRIDGETOWER_PORT: ${EMM_BRIDGETOWER_PORT}
+ PORT: ${EMM_BRIDGETOWER_PORT}
healthcheck:
- test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMBEDDER_PORT}/v1/health_check"]
+ test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMM_BRIDGETOWER_PORT}/v1/health_check"]
interval: 10s
timeout: 6s
retries: 18
@@ -78,13 +80,16 @@ services:
depends_on:
- redis-vector-db
ports:
- - "7000:7000"
+ - "${REDIS_RETRIEVER_PORT}:${REDIS_RETRIEVER_PORT}"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${REDIS_URL}
+ REDIS_DB_PORT: ${REDIS_DB_PORT}
+ REDIS_INSIGHTS_PORT: ${REDIS_INSIGHTS_PORT}
+ REDIS_RETRIEVER_PORT: ${REDIS_RETRIEVER_PORT}
INDEX_NAME: ${INDEX_NAME}
BRIDGE_TOWER_EMBEDDING: ${BRIDGE_TOWER_EMBEDDING}
LOGFLAG: ${LOGFLAG}
@@ -94,11 +99,13 @@ services:
image: ${REGISTRY:-opea}/lvm-llava:${TAG:-latest}
container_name: lvm-llava
ports:
- - "8399:8399"
+ - "${LLAVA_SERVER_PORT}:${LLAVA_SERVER_PORT}"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
+ LLAVA_SERVER_PORT: ${LLAVA_SERVER_PORT}
+ LVM_PORT: ${LVM_PORT}
entrypoint: ["python", "llava_server.py", "--device", "cpu", "--model_name_or_path", $LVM_MODEL_ID]
restart: unless-stopped
lvm:
@@ -107,7 +114,7 @@ services:
depends_on:
- lvm-llava
ports:
- - "9399:9399"
+ - "${LVM_PORT}:${LVM_PORT}"
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -115,6 +122,9 @@ services:
https_proxy: ${https_proxy}
LVM_COMPONENT_NAME: "OPEA_LLAVA_LVM"
LVM_ENDPOINT: ${LVM_ENDPOINT}
+ LLAVA_SERVER_PORT: ${LLAVA_SERVER_PORT}
+ LVM_PORT: ${LVM_PORT}
+ MAX_IMAGES: ${MAX_IMAGES:-1}
restart: unless-stopped
multimodalqna:
image: ${REGISTRY:-opea}/multimodalqna:${TAG:-latest}
@@ -126,17 +136,19 @@ services:
- retriever-redis
- lvm
ports:
- - "8888:8888"
+ - "${MEGA_SERVICE_PORT}:${MEGA_SERVICE_PORT}"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
+ MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT}
MM_EMBEDDING_SERVICE_HOST_IP: ${MM_EMBEDDING_SERVICE_HOST_IP}
MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE}
MM_RETRIEVER_SERVICE_HOST_IP: ${MM_RETRIEVER_SERVICE_HOST_IP}
LVM_SERVICE_HOST_IP: ${LVM_SERVICE_HOST_IP}
- WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
+ LVM_MODEL_ID: ${LVM_MODEL_ID}
+ WHISPER_PORT: ${WHISPER_PORT}
WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
ipc: host
restart: always
@@ -146,7 +158,7 @@ services:
depends_on:
- multimodalqna
ports:
- - "5173:5173"
+ - "${UI_PORT}:${UI_PORT}"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -155,6 +167,9 @@ services:
- DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT}
- DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}
- DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}
+ - MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT}
+ - UI_PORT=${UI_PORT}
+ - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT}
ipc: host
restart: always
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
index 5b41e456ca..438e41fc31 100755
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -6,30 +6,49 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
+export host_ip=${host_ip:-$(hostname -I | awk '{print $1}')} # keep a caller-provided host_ip (e.g. the external IP); auto-detect only when unset
+
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
-export EMBEDDER_PORT=6006
-export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
-export MM_EMBEDDING_PORT_MICROSERVICE=6000
-export WHISPER_SERVER_PORT=7066
-export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"
-export REDIS_URL="redis://${host_ip}:6379"
+
+export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
+export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
+export LVM_SERVICE_HOST_IP=${host_ip}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+
+export WHISPER_PORT=7066
+export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+export WHISPER_MODEL="base"
+export MAX_IMAGES=1
+
+export REDIS_DB_PORT=6379
+export REDIS_INSIGHTS_PORT=8001
+export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
+
+export DATAPREP_MMR_PORT=6007
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+
+export EMM_BRIDGETOWER_PORT=6006
+export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+export MM_EMBEDDING_PORT_MICROSERVICE=6000
export BRIDGE_TOWER_EMBEDDING=true
+
+export REDIS_RETRIEVER_PORT=7000
+
+export LVM_PORT=9399
export LLAVA_SERVER_PORT=8399
-export LVM_ENDPOINT="http://${host_ip}:8399"
-export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
export LVM_MODEL_ID="llava-hf/llava-1.5-7b-hf"
-export WHISPER_MODEL="base"
-export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
-export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
-export LVM_SERVICE_HOST_IP=${host_ip}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}"
+
+export MEGA_SERVICE_PORT=8888
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
+
+export UI_PORT=5173
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
index 598797b74f..068a45900a 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -24,34 +24,48 @@ export your_no_proxy=${your_no_proxy},"External_Public_IP"
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
-export EMBEDDER_PORT=6006
-export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
-export MM_EMBEDDING_PORT_MICROSERVICE=6000
-export REDIS_URL="redis://${host_ip}:6379"
+export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
+export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
+export LVM_SERVICE_HOST_IP=${host_ip}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export REDIS_DB_PORT=6379
+export REDIS_INSIGHTS_PORT=8001
+export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
+export WHISPER_PORT=7066
+export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+export MAX_IMAGES=1
+export WHISPER_MODEL="base"
+export DATAPREP_MMR_PORT=6007
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+export EMM_BRIDGETOWER_PORT=6006
+export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
export BRIDGE_TOWER_EMBEDDING=true
+export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+export MM_EMBEDDING_PORT_MICROSERVICE=6000
+export REDIS_RETRIEVER_PORT=7000
+export LVM_PORT=9399
export LLAVA_SERVER_PORT=8399
-export LVM_ENDPOINT="http://${host_ip}:8399"
-export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+export TGI_GAUDI_PORT="${LLAVA_SERVER_PORT}:80"
export LVM_MODEL_ID="llava-hf/llava-v1.6-vicuna-13b-hf"
-export WHISPER_MODEL="base"
-export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
-export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
-export WHISPER_SERVER_PORT=7066
-export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"v1/audio/transcriptions"
-export LVM_SERVICE_HOST_IP=${host_ip}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}"
+export MEGA_SERVICE_PORT=8888
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
+export UI_PORT=5173
```
Note: Please replace with `host_ip` with you external IP address, do not use localhost.
+> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
+> If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list
+> needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not
+> been trained with multiple images and may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`.
+
## 🚀 Build Docker Images
First of all, you need to build Docker Images locally and install the python package of it.
@@ -63,7 +77,7 @@ Build embedding-multimodal-bridgetower docker image
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
-docker build --no-cache -t opea/embedding-multimodal-bridgetower:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/bridgetower/src/Dockerfile .
+docker build --no-cache -t opea/embedding-multimodal-bridgetower:latest --build-arg EMBEDDER_PORT=$EMM_BRIDGETOWER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/bridgetower/src/Dockerfile .
```
Build embedding microservice image
@@ -98,7 +112,7 @@ docker build --no-cache -t opea/lvm:latest --build-arg https_proxy=$https_proxy
docker build --no-cache -t opea/dataprep-multimodal-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimodal/redis/langchain/Dockerfile .
```
-### 5. Build asr images
+### 5. Build Whisper Server Image
Build whisper server image
@@ -163,14 +177,14 @@ docker compose -f compose.yaml up -d
1. embedding-multimodal-bridgetower
```bash
-curl http://${host_ip}:${EMBEDDER_PORT}/v1/encode \
+curl http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode \
-X POST \
-H "Content-Type:application/json" \
-d '{"text":"This is example"}'
```
```bash
-curl http://${host_ip}:${EMBEDDER_PORT}/v1/encode \
+curl http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode \
-X POST \
-H "Content-Type:application/json" \
-d '{"text":"This is example", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
@@ -202,7 +216,7 @@ curl http://${host_ip}:7000/v1/multimodal_retrieval \
-d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
```
-4. asr
+4. whisper
```bash
curl ${WHISPER_SERVER_ENDPOINT} \
@@ -223,14 +237,14 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \
6. lvm
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
```
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}'
@@ -239,7 +253,7 @@ curl http://${host_ip}:9399/v1/lvm \
Also, validate LVM TGI Gaudi Server with empty retrieval results
```bash
-curl http://${host_ip}:9399/v1/lvm \
+curl http://${host_ip}:${LVM_PORT}/v1/lvm \
-X POST \
-H 'Content-Type: application/json' \
-d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
@@ -247,7 +261,7 @@ curl http://${host_ip}:9399/v1/lvm \
7. Multimodal Dataprep Microservice
-Download a sample video, image, and audio file and create a caption
+Download a sample video, image, PDF, and audio file and create a caption
```bash
export video_fn="WeAreGoingOnBullrun.mp4"
@@ -256,6 +270,9 @@ wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoing
export image_fn="apple.png"
wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
+export pdf_fn="nke-10k-2023.pdf"
+wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
+
export caption_fn="apple.txt"
echo "This is an apple." > ${caption_fn}
@@ -283,13 +300,14 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \
-X POST -F "files=@./${image_fn}"
```
-Now, test the microservice with posting a custom caption along with an image
+Now, test the microservice by posting a custom caption along with an image and a PDF containing images and text.
```bash
curl --silent --write-out "HTTPSTATUS:%{http_code}" \
${DATAPREP_INGEST_SERVICE_ENDPOINT} \
-H 'Content-Type: multipart/form-data' \
- -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}"
+ -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" \
+ -F "files=@./${pdf_fn}"
```
Also, you are able to get the list of all files that you uploaded:
@@ -307,7 +325,8 @@ Then you will get the response python-style LIST like this. Notice the name of e
"WeAreGoingOnBullrun_7ac553a1-116c-40a2-9fc5-deccbb89b507.mp4",
"WeAreGoingOnBullrun_6d13cf26-8ba2-4026-a3a9-ab2e5eb73a29.mp4",
"apple_fcade6e6-11a5-44a2-833a-3e534cbe4419.png",
- "AudioSample_976a85a6-dc3e-43ab-966c-9d81beef780c.wav
+ "nke-10k-2023_28000757-5533-4b1b-89fe-7c0a1b7e2cd0.pdf",
+ "AudioSample_976a85a6-dc3e-43ab-966c-9d81beef780c.wav"
]
```
@@ -321,15 +340,35 @@ curl -X POST \
8. MegaService
+Test the MegaService with a text query:
+
```bash
-curl http://${host_ip}:8888/v1/multimodalqna \
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
-H "Content-Type: application/json" \
-X POST \
-d '{"messages": "What is the revenue of Nike in 2023?"}'
```
+Test the MegaService with an audio query:
+
+```bash
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
+ -H "Content-Type: application/json" \
+ -d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
+```
+
+Test the MegaService with a text and image query:
+
+```bash
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
+ -H "Content-Type: application/json" \
+ -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Green bananas in a tree"}, {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/test-stuff2017/000000004248.jpg"}}]}]}'
+```
+
+Test the MegaService with a back and forth conversation between the user and assistant:
+
```bash
-curl http://${host_ip}:8888/v1/multimodalqna \
+curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}'
```
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 7a2641c9a5..49574c0535 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -6,18 +6,20 @@ services:
image: redis/redis-stack:7.2.0-v9
container_name: redis-vector-db
ports:
- - "6379:6379"
- - "8001:8001"
+ - "${REDIS_DB_PORT}:${REDIS_DB_PORT}"
+ - "${REDIS_INSIGHTS_PORT}:${REDIS_INSIGHTS_PORT}"
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- - "7066:7066"
+ - "${WHISPER_PORT}:${WHISPER_PORT}"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
+ WHISPER_PORT: ${WHISPER_PORT}
+ WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
restart: unless-stopped
dataprep-multimodal-redis:
image: ${REGISTRY:-opea}/dataprep-multimodal-redis:${TAG:-latest}
@@ -26,29 +28,30 @@ services:
- redis-vector-db
- lvm
ports:
- - "6007:6007"
+ - "${DATAPREP_MMR_PORT}:${DATAPREP_MMR_PORT}"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${REDIS_URL}
REDIS_HOST: ${REDIS_HOST}
+ DATAPREP_MMR_PORT: ${DATAPREP_MMR_PORT}
INDEX_NAME: ${INDEX_NAME}
- LVM_ENDPOINT: "http://${LVM_SERVICE_HOST_IP}:9399/v1/lvm"
+ LVM_ENDPOINT: "http://${LVM_SERVICE_HOST_IP}:${LVM_PORT}/v1/lvm"
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
embedding-multimodal-bridgetower:
image: ${REGISTRY:-opea}/embedding-multimodal-bridgetower:${TAG:-latest}
container_name: embedding-multimodal-bridgetower
ports:
- - ${EMBEDDER_PORT}:${EMBEDDER_PORT}
+ - ${EMM_BRIDGETOWER_PORT}:${EMM_BRIDGETOWER_PORT}
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
- PORT: ${EMBEDDER_PORT}
+ PORT: ${EMM_BRIDGETOWER_PORT}
healthcheck:
- test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMBEDDER_PORT}/v1/health_check"]
+ test: ["CMD-SHELL", "http_proxy='' curl -f http://localhost:${EMM_BRIDGETOWER_PORT}/v1/health_check"]
interval: 10s
timeout: 6s
retries: 18
@@ -70,6 +73,7 @@ services:
https_proxy: ${https_proxy}
MMEI_EMBEDDING_ENDPOINT: ${MMEI_EMBEDDING_ENDPOINT}
MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE}
+ EMM_BRIDGETOWER_PORT: ${EMM_BRIDGETOWER_PORT}
MULTIMODAL_EMBEDDING: true
restart: unless-stopped
retriever-redis:
@@ -78,13 +82,16 @@ services:
depends_on:
- redis-vector-db
ports:
- - "7000:7000"
+ - "${REDIS_RETRIEVER_PORT}:${REDIS_RETRIEVER_PORT}"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${REDIS_URL}
+ REDIS_DB_PORT: ${REDIS_DB_PORT}
+ REDIS_INSIGHTS_PORT: ${REDIS_INSIGHTS_PORT}
+ REDIS_RETRIEVER_PORT: ${REDIS_RETRIEVER_PORT}
INDEX_NAME: ${INDEX_NAME}
BRIDGE_TOWER_EMBEDDING: ${BRIDGE_TOWER_EMBEDDING}
LOGFLAG: ${LOGFLAG}
@@ -94,11 +101,14 @@ services:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-llava-gaudi-server
ports:
- - "8399:80"
+ - ${TGI_GAUDI_PORT}
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
+ TGI_GAUDI_PORT: ${TGI_GAUDI_PORT}
+ LLAVA_SERVER_PORT: ${LLAVA_SERVER_PORT}
+ LVM_PORT: ${LVM_PORT}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
@@ -122,7 +132,7 @@ services:
depends_on:
- tgi-gaudi
ports:
- - "9399:9399"
+ - "${LVM_PORT}:${LVM_PORT}"
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -130,8 +140,11 @@ services:
https_proxy: ${https_proxy}
LVM_COMPONENT_NAME: "OPEA_TGI_LLAVA_LVM"
LVM_ENDPOINT: ${LVM_ENDPOINT}
+ LLAVA_SERVER_PORT: ${LLAVA_SERVER_PORT}
+ LVM_PORT: ${LVM_PORT}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
+ MAX_IMAGES: ${MAX_IMAGES:-1}
restart: unless-stopped
multimodalqna:
image: ${REGISTRY:-opea}/multimodalqna:${TAG:-latest}
@@ -143,17 +156,19 @@ services:
- retriever-redis
- lvm
ports:
- - "8888:8888"
+ - "${MEGA_SERVICE_PORT}:${MEGA_SERVICE_PORT}"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
+ MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT}
MM_EMBEDDING_SERVICE_HOST_IP: ${MM_EMBEDDING_SERVICE_HOST_IP}
MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE}
MM_RETRIEVER_SERVICE_HOST_IP: ${MM_RETRIEVER_SERVICE_HOST_IP}
LVM_SERVICE_HOST_IP: ${LVM_SERVICE_HOST_IP}
- WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
+ LVM_MODEL_ID: ${LVM_MODEL_ID}
+ WHISPER_PORT: ${WHISPER_PORT}
WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
ipc: host
restart: always
@@ -163,7 +178,7 @@ services:
depends_on:
- multimodalqna
ports:
- - "5173:5173"
+ - "${UI_PORT}:${UI_PORT}"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -172,6 +187,9 @@ services:
- DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT}
- DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}
- DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}
+ - MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT}
+ - UI_PORT=${UI_PORT}
+ - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT}
ipc: host
restart: always
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index 8fb00423f7..7464eb52a6 100755
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -6,30 +6,50 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
+export host_ip=$(hostname -I | awk '{print $1}')
+
+export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
+export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
+export LVM_SERVICE_HOST_IP=${host_ip}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
-export EMBEDDER_PORT=6006
-export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
-export MM_EMBEDDING_PORT_MICROSERVICE=6000
-export WHISPER_SERVER_PORT=7066
-export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"
-export REDIS_URL="redis://${host_ip}:6379"
+
+export REDIS_DB_PORT=6379
+export REDIS_INSIGHTS_PORT=8001
+export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
+
+export WHISPER_MODEL="base"
+export WHISPER_PORT=7066
+export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+export MAX_IMAGES=1
+
+export DATAPREP_MMR_PORT=6007
+export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+
+export EMM_BRIDGETOWER_PORT=6006
+export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+export MM_EMBEDDING_PORT_MICROSERVICE=6000
export BRIDGE_TOWER_EMBEDDING=true
+
+export REDIS_RETRIEVER_PORT=7000
+
+export LVM_PORT=9399
export LLAVA_SERVER_PORT=8399
-export LVM_ENDPOINT="http://${host_ip}:8399"
-export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+export TGI_GAUDI_PORT="${LLAVA_SERVER_PORT}:80"
export LVM_MODEL_ID="llava-hf/llava-v1.6-vicuna-13b-hf"
-export WHISPER_MODEL="base"
-export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
-export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
-export LVM_SERVICE_HOST_IP=${host_ip}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
-export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
-export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
-export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}"
+
+export MEGA_SERVICE_PORT=8888
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
+
+export UI_PORT=5173
diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml
index 9c26d99d8e..2da504474c 100644
--- a/MultimodalQnA/docker_image_build/build.yaml
+++ b/MultimodalQnA/docker_image_build/build.yaml
@@ -59,9 +59,3 @@ services:
dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile
extends: multimodalqna
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
- asr:
- build:
- context: GenAIComps
- dockerfile: comps/asr/src/Dockerfile
- extends: multimodalqna
- image: ${REGISTRY:-opea}/asr:${TAG:-latest}
diff --git a/MultimodalQnA/multimodalqna.py b/MultimodalQnA/multimodalqna.py
index 02b8334aec..0e3f87d190 100644
--- a/MultimodalQnA/multimodalqna.py
+++ b/MultimodalQnA/multimodalqna.py
@@ -26,20 +26,25 @@
MM_RETRIEVER_SERVICE_HOST_IP = os.getenv("MM_RETRIEVER_SERVICE_HOST_IP", "0.0.0.0")
MM_RETRIEVER_SERVICE_PORT = int(os.getenv("MM_RETRIEVER_SERVICE_PORT", 7000))
LVM_SERVICE_HOST_IP = os.getenv("LVM_SERVICE_HOST_IP", "0.0.0.0")
-LVM_SERVICE_PORT = int(os.getenv("LVM_SERVICE_PORT", 9399))
-WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", "http://0.0.0.0:7066/v1/asr")
+LVM_SERVICE_PORT = int(os.getenv("LVM_PORT", 9399))
+WHISPER_PORT = int(os.getenv("WHISPER_PORT", 7066))
+# Python does not expand "$WHISPER_PORT" inside a plain string; interpolate the port parsed above.
+WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", f"http://0.0.0.0:{WHISPER_PORT}/v1/asr")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.EMBEDDING:
+ if "text" in inputs:
+ input_text = inputs["text"]["text"] if isinstance(inputs["text"], dict) else inputs["text"]
+ if "image" in inputs:
+ input_image = inputs["image"]["base64_image"] if isinstance(inputs["image"], dict) else inputs["image"]
if "text" in inputs and "image" in inputs:
- text_doc = TextDoc(text=inputs["text"])
- image_doc = ImageDoc(base64_image=inputs["image"])
+ text_doc = TextDoc(text=input_text)
+ image_doc = ImageDoc(base64_image=input_image)
inputs = TextImageDoc(text=text_doc, image=image_doc).dict()
elif "image" in inputs:
- inputs = ImageDoc(base64_image=inputs["image"]).dict()
+ inputs = ImageDoc(base64_image=input_image).dict()
elif "text" in inputs:
- inputs = TextDoc(text=inputs["text"]).dict()
+ inputs = TextDoc(text=input_text).dict()
return inputs
@@ -48,6 +53,7 @@ class MultimodalQnAService:
def __init__(self, host="0.0.0.0", port=8000):
self.host = host
self.port = port
+ self._role_labels = self._get_role_labels()
ServiceOrchestrator.align_inputs = align_inputs
self.lvm_megaservice = ServiceOrchestrator()
self.megaservice = ServiceOrchestrator()
@@ -88,6 +94,31 @@ def add_remote_service(self):
# for lvm megaservice
self.lvm_megaservice.add(lvm)
+ def _get_role_labels(self):
+ """Returns a dictionary of role labels that are used in the chat prompt based on the LVM_MODEL_ID
+ environment variable.
+
+ The function defines the role labels used by the llava-1.5, llava-v1.6-vicuna,
+ llava-v1.6-mistral, and llava-interleave models, and then defaults to use "USER:" and "ASSISTANT:" if the
+ LVM_MODEL_ID is not one of those.
+ """
+ lvm_model = os.getenv("LVM_MODEL_ID", "")
+
+ # Default to labels used by llava-1.5 and llava-v1.6-vicuna models
+ role_labels = {"user": "USER:", "assistant": "ASSISTANT:"}
+
+ if "llava-interleave" in lvm_model:
+ role_labels["user"] = "<|im_start|>user"
+ role_labels["assistant"] = "<|im_end|><|im_start|>assistant"
+ elif "llava-v1.6-mistral" in lvm_model:
+ role_labels["user"] = "[INST]"
+ role_labels["assistant"] = " [/INST]"
+ elif "llava-1.5" not in lvm_model and "llava-v1.6-vicuna" not in lvm_model:
+ print(f"[ MultimodalQnAGateway ] Using default role labels for prompt formatting: {role_labels}")
+
+ return role_labels
+
+ # this overrides _handle_message method of Gateway
def _handle_message(self, messages):
images = []
audios = []
@@ -100,6 +131,7 @@ def _handle_message(self, messages):
messages_dict = {}
system_prompt = ""
prompt = ""
+ role_label_dict = self._role_labels
for message in messages:
msg_role = message["role"]
messages_dict = {}
@@ -142,20 +174,24 @@ def _handle_message(self, messages):
for role, message in messages_dict.items():
if isinstance(message, tuple):
text, decoded_audio_input, image_list = message
+ # Remove empty items from the image list
+ image_list = [x for x in image_list if x]
+ # Add image indicators within the conversation
+ image_tags = "<image>\n" * len(image_list)
if i == 0:
# do not add role for the very first message.
# this will be added by llava_server
if text:
- prompt += text + "\n"
+ prompt += image_tags + text + "\n"
elif decoded_audio_input:
- prompt += decoded_audio_input + "\n"
+ prompt += image_tags + decoded_audio_input + "\n"
else:
if text:
- prompt += role.upper() + ": " + text + "\n"
+ prompt += role_label_dict[role] + " " + image_tags + text + "\n"
elif decoded_audio_input:
- prompt += role.upper() + ": " + decoded_audio_input + "\n"
+ prompt += role_label_dict[role] + " " + image_tags + decoded_audio_input + "\n"
else:
- prompt += role.upper() + ":"
+ prompt += role_label_dict[role] + " " + image_tags
if image_list:
for img in image_list:
@@ -186,9 +222,9 @@ def _handle_message(self, messages):
prompt += message + "\n"
else:
if message:
- prompt += role.upper() + ": " + message + "\n"
+ prompt += role_label_dict[role] + " " + message + "\n"
else:
- prompt += role.upper() + ":"
+ prompt += role_label_dict[role]
if images:
b64_types["image"] = images
@@ -217,13 +253,24 @@ def convert_audio_to_text(self, audio):
return response["asr_result"]
async def handle_request(self, request: Request):
+ """MultimodalQnA accepts input queries as text, images, and/or audio.
+
+ The messages in the request can be a single
+ message (which would be assumed to be a first query from the user) or back and forth conversation between the
+ user and the assistant.
+ Audio queries are converted to text before being sent to the megaservice and the translated text is returned
+ as part of the metadata in the response.
+ First queries are sent to the full Multimodal megaservice, which includes using the embedding microservice and
+ retriever, in order to get relevant information from the vector store to send to the LVM along with the user's
+ query. Follow up queries are sent directly to the LVM without searching for more similar information from the
+ vector store.
+ """
data = await request.json()
stream_opt = bool(data.get("stream", False))
if stream_opt:
print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!")
stream_opt = False
chat_request = ChatCompletionRequest.model_validate(data)
- # Multimodal RAG QnA With Videos has not yet accepts image as input during QnA.
num_messages = len(data["messages"]) if isinstance(data["messages"], list) else 1
messages = self._handle_message(chat_request.messages)
decoded_audio_input = ""
@@ -237,7 +284,7 @@ async def handle_request(self, request: Request):
# for metadata storage purposes
decoded_audio_input = b64_types["audio"]
if "image" in b64_types:
- initial_inputs = {"prompt": prompt, "image": b64_types["image"][0]}
+ initial_inputs = {"prompt": prompt, "image": b64_types["image"]}
else:
initial_inputs = {"prompt": prompt, "image": ""}
else:
@@ -248,12 +295,16 @@ async def handle_request(self, request: Request):
cur_megaservice = self.megaservice
if isinstance(messages, tuple):
prompt, b64_types = messages
+ initial_inputs = {"text": prompt}
if "audio" in b64_types:
# for metadata storage purposes
decoded_audio_input = b64_types["audio"]
+ if "image" in b64_types and len(b64_types["image"]) > 0:
+ # Format initial inputs to match TextImageDoc
+ initial_inputs["text"] = {"text": prompt}
+ initial_inputs["image"] = {"base64_image": b64_types["image"][0]}
else:
- prompt = messages
- initial_inputs = {"text": prompt}
+ initial_inputs = {"text": messages}
parameters = LLMParams(
max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh
index 85e2af3e24..747d6d7ed0 100644
--- a/MultimodalQnA/tests/test_compose_on_gaudi.sh
+++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh
@@ -17,10 +17,35 @@ ip_address=$(hostname -I | awk '{print $1}')
export image_fn="apple.png"
export video_fn="WeAreGoingOnBullrun.mp4"
export caption_fn="apple.txt"
+export pdf_fn="nke-10k-2023.pdf"
+
+# Poll a container's logs until a readiness marker shows up.
+#   $1 - container name passed to `docker logs`
+#   $2 - maximum number of polling attempts (30s sleep after each miss)
+#   $3 - string to grep for in the container logs
+# On exhaustion, prints a warning and appends the container logs to
+# ${LOG_PATH}/<container>_file.log; it does not fail the calling script.
+function check_service_ready() {
+    local container_name="$1"
+    local max_retries="$2"
+    local log_string="$3"
+    # Track readiness explicitly: the loop counter alone cannot distinguish
+    # "became ready on the last attempt" from "never became ready".
+    local ready=false
+
+    for i in $(seq 1 "$max_retries")
+    do
+        # `|| true` keeps a grep miss from aborting scripts running under `set -e`.
+        service_logs=$(docker logs "$container_name" 2>&1 | grep "$log_string" || true)
+        if [[ -z "$service_logs" ]]; then
+            echo "The $container_name service is not ready yet, sleeping 30s..."
+            sleep 30s
+        else
+            echo "$container_name service is ready"
+            ready=true
+            break
+        fi
+    done
+
+    if [[ "$ready" != true ]]; then
+        echo "WARNING: Max retries reached when waiting for the $container_name service to be ready"
+        # Braces are required: "$container_name_file" would be read as a single (unset) variable.
+        docker logs "$container_name" >> "${LOG_PATH}/${container_name}_file.log"
+    fi
+}
function build_docker_images() {
cd $WORKPATH/docker_image_build
- git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+ #git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+ git clone https://github.com/mhbuehler/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"mmqna-image-query"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm dataprep-multimodal-redis whisper"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
@@ -32,30 +57,39 @@ function build_docker_images() {
function setup_env() {
export host_ip=${ip_address}
- export EMBEDDER_PORT=6006
- export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
- export MM_EMBEDDING_PORT_MICROSERVICE=6000
- export WHISPER_SERVER_PORT=7066
- export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"
- export REDIS_URL="redis://${host_ip}:6379"
- export REDIS_HOST=${host_ip}
- export INDEX_NAME="mm-rag-redis"
- export BRIDGE_TOWER_EMBEDDING=true
- export LLAVA_SERVER_PORT=8399
- export LVM_ENDPOINT="http://${host_ip}:8399"
- export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
- export LVM_MODEL_ID="llava-hf/llava-v1.6-vicuna-7b-hf"
- export WHISPER_MODEL="base"
export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
export LVM_SERVICE_HOST_IP=${host_ip}
export MEGA_SERVICE_HOST_IP=${host_ip}
- export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
- export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+ export REDIS_DB_PORT=6379
+ export REDIS_INSIGHTS_PORT=8001
+ export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
+ export REDIS_HOST=${host_ip}
+ export INDEX_NAME="mm-rag-redis"
+ export WHISPER_PORT=7066
+ export MAX_IMAGES=1
+ export WHISPER_MODEL="base"
+ export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+ export DATAPREP_MMR_PORT=6007
+ export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+ export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+ export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+ export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+ export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+ export EMM_BRIDGETOWER_PORT=6006
+ export BRIDGE_TOWER_EMBEDDING=true
+ export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+ export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+ export MM_EMBEDDING_PORT_MICROSERVICE=6000
+ export REDIS_RETRIEVER_PORT=7000
+ export LVM_PORT=9399
+ export LLAVA_SERVER_PORT=8399
+ export TGI_GAUDI_PORT="${LLAVA_SERVER_PORT}:80"
+ export LVM_MODEL_ID="llava-hf/llava-v1.6-vicuna-13b-hf"
+ export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}"
+ export MEGA_SERVICE_PORT=8888
+ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
+ export UI_PORT=5173
}
function start_services() {
@@ -72,6 +106,7 @@ function prepare_data() {
echo "Downloading image and video"
wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn}
+ wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
echo "Writing caption file"
echo "This is an apple." > ${caption_fn}
@@ -94,6 +129,9 @@ function validate_service() {
elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then
cd $LOG_PATH
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL")
+ elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then
+ cd $LOG_PATH
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL")
elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
@@ -130,14 +168,14 @@ function validate_microservices() {
# Bridgetower Embedding Server
echo "Validating embedding-multimodal-bridgetower"
validate_service \
- "http://${host_ip}:${EMBEDDER_PORT}/v1/encode" \
+ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
"embedding-multimodal-bridgetower" \
"embedding-multimodal-bridgetower" \
'{"text":"This is example"}'
validate_service \
- "http://${host_ip}:${EMBEDDER_PORT}/v1/encode" \
+ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
"embedding-multimodal-bridgetower" \
"embedding-multimodal-bridgetower" \
@@ -162,20 +200,27 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
# test data prep
- echo "Data Prep with Generating Transcript for Video"
+ echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
"${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
"dataprep-multimodal-redis-transcript" \
"dataprep-multimodal-redis"
- echo "Data Prep with Image & Caption Ingestion"
+ echo "Validating Data Prep with Image & Caption Ingestion"
validate_service \
"${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
"dataprep-multimodal-redis-ingest" \
"dataprep-multimodal-redis"
+ echo "Validating Data Prep with PDF"
+ validate_service \
+ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
+ "Data preparation succeeded" \
+ "dataprep-multimodal-redis-pdf" \
+ "dataprep-multimodal-redis"
+
echo "Validating get file returns mp4"
validate_service \
"${DATAPREP_GET_FILE_ENDPOINT}" \
@@ -196,13 +241,14 @@ function validate_microservices() {
echo "Validating retriever-redis"
your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
validate_service \
- "http://${host_ip}:7000/v1/retrieval" \
+ "http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval" \
"retrieved_docs" \
"retriever-redis" \
"retriever-redis" \
"{\"text\":\"test\",\"embedding\":${your_embedding}}"
- sleep 3m
+ echo "Wait for tgi-llava-gaudi-server service to be ready"
+ check_service_ready "tgi-llava-gaudi-server" 20 "Connected"
# llava server
echo "Evaluating LLAVA tgi-gaudi"
@@ -213,17 +259,25 @@ function validate_microservices() {
"tgi-llava-gaudi-server" \
'{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}'
+ echo "Evaluating LLAVA tgi-gaudi with multiple images"
+ validate_service \
+ "http://${host_ip}:${LLAVA_SERVER_PORT}/generate" \
+ '"generated_text":' \
+ "tgi-gaudi" \
+ "tgi-llava-gaudi-server" \
+ '{"inputs":"![]()![]()What is the content of these two images?\n\n","parameters":{"max_new_tokens":32, "seed": 42}}'
+
# lvm
echo "Evaluating lvm"
validate_service \
- "http://${host_ip}:9399/v1/lvm" \
+ "http://${host_ip}:${LVM_PORT}/v1/lvm" \
'"text":"' \
"lvm" \
"lvm" \
'{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
# data prep requiring lvm
- echo "Data Prep with Generating Caption for Image"
+ echo "Validating Data Prep with Generating Caption for Image"
validate_service \
"${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
@@ -235,33 +289,41 @@ function validate_microservices() {
function validate_megaservice() {
# Curl the Mega Service with retrieval
- echo "Validate megaservice with first query"
+ echo "Validating megaservice with first query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"time_of_frame_ms":' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
- echo "Validate megaservice with first audio query"
+ echo "Validating megaservice with first audio query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"time_of_frame_ms":' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
- echo "Validate megaservice with follow-up query"
+ echo "Validating megaservice with first query with an image"
+ validate_service \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
+ '"time_of_frame_ms":' \
+ "multimodalqna" \
+ "multimodalqna-backend-server" \
+ '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Find a similar image"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}]}'
+
+ echo "Validating megaservice with follow-up query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"content":"' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}'
- echo "Validate megaservice with multiple text queries"
+ echo "Validating megaservice with multiple text queries"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"content":"' \
"multimodalqna" \
"multimodalqna-backend-server" \
@@ -270,7 +332,7 @@ function validate_megaservice() {
}
function validate_delete {
- echo "Validate data prep delete files"
+ echo "Validating data prep delete files"
validate_service \
"${DATAPREP_DELETE_FILE_ENDPOINT}" \
'{"status":true}' \
@@ -284,6 +346,7 @@ function delete_data() {
rm -rf ${image_fn}
rm -rf ${video_fn}
rm -rf ${caption_fn}
+ rm -rf ${pdf_fn}
}
function stop_docker() {
diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh
index 65fe94390d..42e24b877c 100644
--- a/MultimodalQnA/tests/test_compose_on_rocm.sh
+++ b/MultimodalQnA/tests/test_compose_on_rocm.sh
@@ -20,8 +20,8 @@ export caption_fn="apple.txt"
function build_docker_images() {
cd $WORKPATH/docker_image_build
- git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
-
+ #git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+ git clone https://github.com/mhbuehler/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"mmqna-image-query"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm dataprep-multimodal-redis whisper"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh
index 9a8eeec8bf..295ab5e8c2 100644
--- a/MultimodalQnA/tests/test_compose_on_xeon.sh
+++ b/MultimodalQnA/tests/test_compose_on_xeon.sh
@@ -17,10 +17,35 @@ ip_address=$(hostname -I | awk '{print $1}')
export image_fn="apple.png"
export video_fn="WeAreGoingOnBullrun.mp4"
export caption_fn="apple.txt"
+export pdf_fn="nke-10k-2023.pdf"
+
+# Wait until a container's logs contain a marker string.
+#   $1 - container name passed to `docker logs`
+#   $2 - maximum number of polls (30s sleep after each miss)
+#   $3 - grep pattern whose presence in the logs signals readiness
+# Always returns normally: on timeout it prints a warning and appends the
+# container logs to ${LOG_PATH}/<container>_file.log.
+function check_service_ready() {
+    local container_name="$1"
+    local max_retries="$2"
+    local log_string="$3"
+    local ready=false
+    local i service_logs
+
+    for i in $(seq 1 "$max_retries")
+    do
+        # `|| true` keeps a non-matching grep from yielding a failing status
+        # (matters if the calling script runs under `set -e`).
+        service_logs=$(docker logs "$container_name" 2>&1 | grep "$log_string" || true)
+        if [[ -n "$service_logs" ]]; then
+            echo "$container_name service is ready"
+            ready=true
+            break
+        fi
+        echo "The $container_name service is not ready yet, sleeping 30s..."
+        sleep 30s
+    done
+
+    # Track readiness explicitly: comparing $i with $max_retries would also
+    # warn when the service became ready on the very last attempt.
+    if [[ "$ready" != true ]]; then
+        echo "WARNING: Max retries reached when waiting for the $container_name service to be ready"
+        # Braces are required: "$container_name_file" names a different (unset) variable.
+        docker logs "$container_name" >> "${LOG_PATH}/${container_name}_file.log" 2>&1
+    fi
+}
function build_docker_images() {
cd $WORKPATH/docker_image_build
- git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+ #git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+ git clone https://github.com/mhbuehler/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"mmqna-image-query"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm-llava lvm dataprep-multimodal-redis whisper"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
@@ -30,33 +55,43 @@ function build_docker_images() {
function setup_env() {
export host_ip=${ip_address}
- export EMBEDDER_PORT=6006
- export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMBEDDER_PORT"
- export MM_EMBEDDING_PORT_MICROSERVICE=6000
- export WHISPER_SERVER_PORT=7066
- export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr"
- export REDIS_URL="redis://${host_ip}:6379"
+ export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
+ export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
+ export LVM_SERVICE_HOST_IP=${host_ip}
+ export MEGA_SERVICE_HOST_IP=${host_ip}
+ export WHISPER_PORT=7066
+ export MAX_IMAGES=1
+ export WHISPER_MODEL="base"
+ export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+ export REDIS_DB_PORT=6379
+ export REDIS_INSIGHTS_PORT=8001
+ export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
export REDIS_HOST=${host_ip}
export INDEX_NAME="mm-rag-redis"
+ export DATAPREP_MMR_PORT=6007
+ export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/ingest_with_text"
+ export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_transcripts"
+ export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/generate_captions"
+ export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/get_files"
+ export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete_files"
+ export EMM_BRIDGETOWER_PORT=6006
export BRIDGE_TOWER_EMBEDDING=true
+ export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
+ export MMEI_EMBEDDING_ENDPOINT="http://${host_ip}:$EMM_BRIDGETOWER_PORT"
+ export MM_EMBEDDING_PORT_MICROSERVICE=6000
+ export REDIS_RETRIEVER_PORT=7000
+ export LVM_PORT=9399
export LLAVA_SERVER_PORT=8399
- export LVM_ENDPOINT="http://${host_ip}:8399"
export LVM_MODEL_ID="llava-hf/llava-1.5-7b-hf"
- export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc"
- export WHISPER_MODEL="base"
- export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
- export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
- export LVM_SERVICE_HOST_IP=${host_ip}
- export MEGA_SERVICE_HOST_IP=${host_ip}
- export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/multimodalqna"
- export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/ingest_with_text"
- export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_transcripts"
- export DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/generate_captions"
- export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_files"
- export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_files"
+ export LVM_ENDPOINT="http://${host_ip}:$LLAVA_SERVER_PORT"
+ export MEGA_SERVICE_PORT=8888
+ export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:$MEGA_SERVICE_PORT/v1/multimodalqna"
+ export UI_PORT=5173
}
+
function start_services() {
+ echo "Starting services..."
cd $WORKPATH/docker_compose/intel/cpu/xeon
@@ -64,6 +99,7 @@ function start_services() {
sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
sleep 2m
+ echo "Services started."
}
function prepare_data() {
@@ -71,11 +107,13 @@ function prepare_data() {
echo "Downloading image and video"
wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn}
+ wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
echo "Writing caption file"
echo "This is an apple." > ${caption_fn}
sleep 1m
}
+
function validate_service() {
local URL="$1"
local EXPECTED_RESULT="$2"
@@ -92,6 +130,9 @@ function validate_service() {
elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then
cd $LOG_PATH
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL")
+ elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then
+ cd $LOG_PATH
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL")
elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
@@ -128,14 +169,14 @@ function validate_microservices() {
# Bridgetower Embedding Server
echo "Validating embedding-multimodal-bridgetower"
validate_service \
- "http://${host_ip}:${EMBEDDER_PORT}/v1/encode" \
+ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
"embedding-multimodal-bridgetower" \
"embedding-multimodal-bridgetower" \
'{"text":"This is example"}'
validate_service \
- "http://${host_ip}:${EMBEDDER_PORT}/v1/encode" \
+ "http://${host_ip}:${EMM_BRIDGETOWER_PORT}/v1/encode" \
'"embedding":[' \
"embedding-multimodal-bridgetower" \
"embedding-multimodal-bridgetower" \
@@ -160,20 +201,27 @@ function validate_microservices() {
sleep 1m # retrieval can't curl as expected, try to wait for more time
# test data prep
- echo "Data Prep with Generating Transcript for Video"
+ echo "Validating Data Prep with Generating Transcript for Video"
validate_service \
"${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
"dataprep-multimodal-redis-transcript" \
"dataprep-multimodal-redis"
- echo "Data Prep with Image & Caption Ingestion"
+ echo "Validating Data Prep with Image & Caption Ingestion"
validate_service \
"${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
"dataprep-multimodal-redis-ingest" \
"dataprep-multimodal-redis"
+ echo "Validating Data Prep with PDF"
+ validate_service \
+ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
+ "Data preparation succeeded" \
+ "dataprep-multimodal-redis-pdf" \
+ "dataprep-multimodal-redis"
+
echo "Validating get file returns mp4"
validate_service \
"${DATAPREP_GET_FILE_ENDPOINT}" \
@@ -194,13 +242,14 @@ function validate_microservices() {
echo "Validating retriever-redis"
your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
validate_service \
- "http://${host_ip}:7000/v1/retrieval" \
+ "http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval" \
"retrieved_docs" \
"retriever-redis" \
"retriever-redis" \
"{\"text\":\"test\",\"embedding\":${your_embedding}}"
- sleep 3m
+ echo "Wait for lvm-llava service to be ready"
+ check_service_ready "lvm-llava" 10 "Uvicorn running on http://"
# llava server
echo "Evaluating lvm-llava"
@@ -211,17 +260,25 @@ function validate_microservices() {
"lvm-llava" \
'{"prompt":"Describe the image please.", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
+ echo "Evaluating lvm-llava with a list of images"
+ validate_service \
+ "http://${host_ip}:${LLAVA_SERVER_PORT}/generate" \
+ '"text":' \
+ "lvm-llava" \
+ "lvm-llava" \
+ '{"prompt":"Describe the image please.", "img_b64_str": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC","iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC"]}'
+
# lvm
echo "Evaluating lvm"
validate_service \
- "http://${host_ip}:9399/v1/lvm" \
+ "http://${host_ip}:${LVM_PORT}/v1/lvm" \
'"text":"' \
"lvm" \
"lvm" \
'{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
# data prep requiring lvm
- echo "Data Prep with Generating Caption for Image"
+ echo "Validating Data Prep with Generating Caption for Image"
validate_service \
"${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}" \
"Data preparation succeeded" \
@@ -233,33 +290,41 @@ function validate_microservices() {
function validate_megaservice() {
# Curl the Mega Service with retrieval
- echo "Validate megaservice with first query"
+ echo "Validating megaservice with first query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"time_of_frame_ms":' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
- echo "Validate megaservice with first audio query"
+ echo "Validating megaservice with first audio query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"time_of_frame_ms":' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
- echo "Validate megaservice with follow-up query"
+ echo "Validating megaservice with first query with an image"
+ validate_service \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
+ '"time_of_frame_ms":' \
+ "multimodalqna" \
+ "multimodalqna-backend-server" \
+ '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Find a similar image"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}]}'
+
+ echo "Validating megaservice with follow-up query"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"content":"' \
"multimodalqna" \
"multimodalqna-backend-server" \
'{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": [{"type": "text", "text": "goodbye"}]}]}'
- echo "Validate megaservice with multiple text queries"
+ echo "Validating megaservice with multiple text queries"
validate_service \
- "http://${host_ip}:8888/v1/multimodalqna" \
+ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
'"content":"' \
"multimodalqna" \
"multimodalqna-backend-server" \
@@ -267,7 +332,7 @@ function validate_megaservice() {
}
function validate_delete {
- echo "Validate data prep delete files"
+ echo "Validating data prep delete files"
validate_service \
"${DATAPREP_DELETE_FILE_ENDPOINT}" \
'{"status":true}' \
@@ -280,12 +345,15 @@ function delete_data() {
echo "Deleting image, video, and caption"
rm -rf ${image_fn}
rm -rf ${video_fn}
+ rm -rf ${pdf_fn}
rm -rf ${caption_fn}
}
function stop_docker() {
+ echo "Stopping docker..."
cd $WORKPATH/docker_compose/intel/cpu/xeon
docker compose -f compose.yaml stop && docker compose -f compose.yaml rm -f
+ echo "Docker stopped."
}
function main() {
diff --git a/MultimodalQnA/ui/gradio/conversation.py b/MultimodalQnA/ui/gradio/conversation.py
index f080d7a154..678f7872c2 100644
--- a/MultimodalQnA/ui/gradio/conversation.py
+++ b/MultimodalQnA/ui/gradio/conversation.py
@@ -3,8 +3,9 @@
import dataclasses
from enum import Enum, auto
-from typing import List
+from typing import Dict, List
+from PIL import Image
from utils import convert_audio_to_base64, get_b64_frame_from_timestamp
@@ -21,6 +22,7 @@ class Conversation:
system: str
roles: List[str]
messages: List[List[str]]
+ image_query_files: Dict[int, str]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "\n"
@@ -32,6 +34,7 @@ class Conversation:
split_video: str = None
image: str = None
audio_query_file: str = None
+ pdf: str = None
def _template_caption(self):
out = ""
@@ -45,6 +48,17 @@ def get_prompt(self):
# Need to do RAG. If the query is text, prompt is the query only
if self.audio_query_file:
ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}]
+ elif 0 in self.image_query_files:
+ b64_image = get_b64_frame_from_timestamp(self.image_query_files[0], 0)
+ ret = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": messages[0][1]},
+ {"type": "image_url", "image_url": {"url": b64_image}},
+ ],
+ }
+ ]
else:
ret = messages[0][1]
else:
@@ -54,10 +68,11 @@ def get_prompt(self):
for i, (role, message) in enumerate(messages):
if message:
dic = {"role": role}
+ content = [{"type": "text", "text": message}]
+ # There might be audio
if self.audio_query_file:
- content = [{"type": "audio", "audio": self.get_b64_audio_query()}]
- else:
- content = [{"type": "text", "text": message}]
+ content.append({"type": "audio", "audio": self.get_b64_audio_query()})
+ # There might be a returned item from the first query
if i == 0 and self.time_of_frame_ms and self.video_file:
base64_frame = (
self.base64_frame
@@ -66,7 +81,18 @@ def get_prompt(self):
)
if base64_frame is None:
base64_frame = ""
+ # Include the original caption for the returned image/video
+ if self.caption and content[0]["type"] == "text":
+ content[0]["text"] = content[0]["text"] + " " + self._template_caption()
content.append({"type": "image_url", "image_url": {"url": base64_frame}})
+ # There might be a query image
+ if i in self.image_query_files:
+ content.append(
+ {
+ "type": "image_url",
+ "image_url": {"url": get_b64_frame_from_timestamp(self.image_query_files[i], 0)},
+ }
+ )
dic["content"] = content
conv_dict.append(dic)
else:
@@ -117,6 +143,31 @@ def to_gradio_chatbot(self):
img_str = f''
msg = img_str + msg.replace("", "").strip()
ret.append([msg, None])
+ elif i in self.image_query_files:
+ import base64
+ from io import BytesIO
+
+ image = Image.open(self.image_query_files[i])
+ max_hw, min_hw = max(image.size), min(image.size)
+ aspect_ratio = max_hw / min_hw
+ max_len, min_len = 800, 400
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+ longest_edge = int(shortest_edge * aspect_ratio)
+ W, H = image.size
+ if H > W:
+ H, W = longest_edge, shortest_edge
+ else:
+ H, W = shortest_edge, longest_edge
+ image = image.resize((W, H))
+ buffered = BytesIO()
+ if image.format not in ["JPEG", "JPG"]:
+ image = image.convert("RGB")
+ image.save(buffered, format="JPEG")
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+ img_str = f''
+ msg = img_str + msg.replace("", "").strip()
+ ret.append([msg, None])
+
else:
ret.append([msg, None])
else:
@@ -128,6 +179,7 @@ def copy(self):
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
+ # Copy the mapping: callers insert into it, so sharing one dict would leak image queries between copies.
+ image_query_files=dict(self.image_query_files),
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
@@ -150,6 +202,7 @@ def dict(self):
"split_video": self.split_video,
"image": self.image,
"audio_query_file": self.audio_query_file,
+ "pdf": self.pdf,
}
@@ -157,6 +210,7 @@ def dict(self):
system="",
roles=("user", "assistant"),
messages=(),
+ image_query_files={},
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="\n",
@@ -167,4 +221,5 @@ def dict(self):
split_video=None,
image=None,
audio_query_file=None,
+ pdf=None,
)
diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
index 28d3534be5..6b94e54be9 100644
--- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
+++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
@@ -13,7 +13,8 @@
from conversation import multimodalqna_conv
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
-from utils import build_logger, make_temp_image, moderation_msg, server_error_msg, split_video
+from gradio_pdf import PDF
+from utils import build_logger, make_temp_image, server_error_msg, split_video
logger = build_logger("gradio_web_server", "gradio_web_server.log")
logflag = os.getenv("LOGFLAG", False)
@@ -50,11 +51,17 @@ def clear_history(state, request: gr.Request):
os.remove(state.split_video)
if state.image and os.path.exists(state.image):
os.remove(state.image)
+ if state.pdf and os.path.exists(state.pdf):
+ os.remove(state.pdf)
state = multimodalqna_conv.copy()
- return (state, state.to_gradio_chatbot(), None, None, None, None) + (disable_btn,) * 1
+ video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media")
+ image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media")
+ pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media")
+ return (state, state.to_gradio_chatbot(), {"text": "", "files": []}, None, video, image, pdf) + (disable_btn,) * 1
-def add_text(state, text, audio, request: gr.Request):
+def add_text(state, textbox, audio, request: gr.Request):
+ text = textbox["text"]
logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
if audio:
state.audio_query_file = audio
@@ -62,6 +69,14 @@ def add_text(state, text, audio, request: gr.Request):
state.append_message(state.roles[1], None)
state.skip_next = False
return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1
+ # If it is an image query
+ elif textbox["files"]:
+ image_file = textbox["files"][0]
+ state.image_query_files[len(state.messages)] = image_file
+ state.append_message(state.roles[0], text)
+ state.append_message(state.roles[1], None)
+ state.skip_next = False
+ return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1
elif len(text) <= 0:
state.skip_next = True
return (state, state.to_gradio_chatbot(), None, None) + (no_change_btn,) * 1
@@ -83,8 +98,7 @@ def http_bot(state, request: gr.Request):
is_audio_query = state.audio_query_file is not None
if state.skip_next:
# This generate call is skipped due to invalid inputs
- path_to_sub_videos = state.get_path_to_subvideos()
- yield (state, state.to_gradio_chatbot(), path_to_sub_videos, None) + (no_change_btn,) * 1
+ yield (state, state.to_gradio_chatbot(), None, None, None) + (no_change_btn,) * 1
return
if len(state.messages) == state.offset + 2:
@@ -94,6 +108,7 @@ def http_bot(state, request: gr.Request):
new_state.append_message(new_state.roles[0], state.messages[-2][1])
new_state.append_message(new_state.roles[1], None)
new_state.audio_query_file = state.audio_query_file
+ new_state.image_query_files = state.image_query_files
state = new_state
# Construct prompt
@@ -104,12 +119,13 @@ def http_bot(state, request: gr.Request):
"messages": prompt,
}
- logger.info(f"==== request ====\n{pload}")
+ if logflag:
+ logger.info(f"==== request ====\n{pload}")
logger.info(f"==== url request ====\n{gateway_addr}")
state.messages[-1][-1] = "▌"
- yield (state, state.to_gradio_chatbot(), state.split_video, state.image) + (disable_btn,) * 1
+ yield (state, state.to_gradio_chatbot(), state.split_video, state.image, state.pdf) + (disable_btn,) * 1
try:
response = requests.post(
@@ -137,6 +153,7 @@ def http_bot(state, request: gr.Request):
video_file = metadata["source_video"]
state.video_file = os.path.join(static_dir, metadata["source_video"])
state.time_of_frame_ms = metadata["time_of_frame_ms"]
+ state.caption = metadata["transcript_for_inference"]
file_ext = os.path.splitext(state.video_file)[-1]
if file_ext == ".mp4":
try:
@@ -154,12 +171,19 @@ def http_bot(state, request: gr.Request):
print(f"image {state.video_file} does not exist in UI host!")
output_image_path = None
state.image = output_image_path
+ elif file_ext == ".pdf":
+ try:
+ output_pdf_path = make_temp_image(state.video_file, file_ext)
+ except:
+ print(f"pdf {state.video_file} does not exist in UI host!")
+ output_pdf_path = None
+ state.pdf = output_pdf_path
else:
raise requests.exceptions.RequestException
except requests.exceptions.RequestException as e:
state.messages[-1][-1] = server_error_msg
- yield (state, state.to_gradio_chatbot(), None, None) + (enable_btn,)
+ yield (state, state.to_gradio_chatbot(), None, None, None) + (enable_btn,)
return
state.messages[-1][-1] = message
@@ -173,6 +197,7 @@ def http_bot(state, request: gr.Request):
state.to_gradio_chatbot(),
gr.Video(state.split_video, visible=state.split_video is not None),
gr.Image(state.image, visible=state.image is not None),
+ PDF(state.pdf, visible=state.pdf is not None, interactive=False, starting_page=int(state.time_of_frame_ms)),
) + (enable_btn,) * 1
logger.info(f"{state.messages[-1][-1]}")
@@ -347,6 +372,57 @@ def ingest_with_text(filepath, text, request: gr.Request):
return
+def ingest_pdf(filepath, request: gr.Request):
+    """Ingest an uploaded PDF and report progress to the UI.
+
+    Generator used as a Gradio event handler: every yielded value is a
+    ``gr.Textbox`` update carrying the current upload status.
+
+    Steps:
+      1. Reject paths that do not resolve inside ``tmp_upload_folder``.
+      2. Copy the file into ``static_dir``.
+      3. POST the copy to ``dataprep_ingest_addr`` as multipart form data.
+      4. On HTTP 200, rename the copy to the name the service returned in
+         ``file_id_maps`` (keyed by the file name without extension).
+
+    Args:
+        filepath: Path of the uploaded PDF as provided by the upload component.
+        request: Gradio request object (unused here).
+    """
+    yield (gr.Textbox(visible=True, value="Please wait while your uploaded PDF is ingested into the database..."))
+    verified_filepath = os.path.normpath(filepath)
+    if not verified_filepath.startswith(tmp_upload_folder):
+        print("Found malicious PDF file name!")
+        yield (
+            gr.Textbox(
+                visible=True,
+                # The backslash is escaped explicitly; "\," is an invalid escape sequence.
+                value="Your uploaded PDF's file name has special characters that are not allowed (depends on the OS, some examples are \\, /, :, and *). Please consider changing the file name.",
+            )
+        )
+        return
+    basename = os.path.basename(verified_filepath)
+    dest = os.path.join(static_dir, basename)
+    shutil.copy(verified_filepath, dest)
+    print("Done copying uploaded file to static folder.")
+    headers = {
+        # 'Content-Type': 'multipart/form-data'
+    }
+    # Close the file handle as soon as the upload finishes instead of leaking it.
+    with open(dest, "rb") as pdf_file:
+        files = {
+            "files": pdf_file,
+        }
+        response = requests.post(dataprep_ingest_addr, headers=headers, files=files)
+    print(response.status_code)
+    if response.status_code == 200:
+        response = response.json()
+        yield (gr.Textbox(visible=True, value="The PDF ingestion is done. Saving your uploaded PDF..."))
+        time.sleep(2)
+        fn_no_ext = Path(dest).stem
+        if "file_id_maps" in response and fn_no_ext in response["file_id_maps"]:
+            # Rename the static copy to the name assigned by the ingestion service.
+            new_dst = os.path.join(static_dir, response["file_id_maps"][fn_no_ext])
+            print(response["file_id_maps"][fn_no_ext])
+            os.rename(dest, new_dst)
+            yield (
+                gr.Textbox(
+                    visible=True,
+                    value="Congratulations, your upload is done!\nClick the X button on the top right of the PDF upload box to upload another file.",
+                )
+            )
+        return
+    else:
+        yield (
+            gr.Textbox(
+                visible=True,
+                value=f"Something went wrong (server error: {response.status_code})!\nPlease click the X button on the top right of the PDF upload box to reupload your file.",
+            )
+        )
+        time.sleep(2)
+        return
+
+
def hide_text(request: gr.Request):
return gr.Textbox(visible=False)
@@ -356,19 +432,23 @@ def clear_text(request: gr.Request):
with gr.Blocks() as upload_video:
- gr.Markdown("# Ingest Your Own Video Using Generated Transcripts or Captions")
- gr.Markdown("Use this interface to ingest your own video and generate transcripts or captions for it")
+ gr.Markdown("# Ingest Videos Using Generated Transcripts or Captions")
+ gr.Markdown("Use this interface to ingest a video and generate transcripts or captions for it")
def select_upload_type(choice, request: gr.Request):
if choice == "transcript":
- return gr.Video(sources="upload", visible=True), gr.Video(sources="upload", visible=False)
+ return gr.Video(sources="upload", visible=True, format="mp4"), gr.Video(
+ sources="upload", visible=False, format="mp4"
+ )
else:
- return gr.Video(sources="upload", visible=False), gr.Video(sources="upload", visible=True)
+ return gr.Video(sources="upload", visible=False, format="mp4"), gr.Video(
+ sources="upload", visible=True, format="mp4"
+ )
with gr.Row():
with gr.Column(scale=6):
- video_upload_trans = gr.Video(sources="upload", elem_id="video_upload_trans", visible=True)
- video_upload_cap = gr.Video(sources="upload", elem_id="video_upload_cap", visible=False)
+ video_upload_trans = gr.Video(sources="upload", elem_id="video_upload_trans", visible=True, format="mp4")
+ video_upload_cap = gr.Video(sources="upload", elem_id="video_upload_cap", visible=False, format="mp4")
with gr.Column(scale=3):
text_options_radio = gr.Radio(
[
@@ -391,8 +471,8 @@ def select_upload_type(choice, request: gr.Request):
text_options_radio.change(select_upload_type, [text_options_radio], [video_upload_trans, video_upload_cap])
with gr.Blocks() as upload_image:
- gr.Markdown("# Ingest Your Own Image Using Generated or Custom Captions/Labels")
- gr.Markdown("Use this interface to ingest your own image and generate a caption for it")
+ gr.Markdown("# Ingest Images Using Generated or Custom Captions")
+ gr.Markdown("Use this interface to ingest an image and generate a caption for it")
def select_upload_type(choice, request: gr.Request):
if choice == "gen_caption":
@@ -424,8 +504,8 @@ def select_upload_type(choice, request: gr.Request):
text_options_radio.change(select_upload_type, [text_options_radio], [image_upload_cap, image_upload_text])
with gr.Blocks() as upload_audio:
- gr.Markdown("# Ingest Your Own Audio Using Generated Transcripts")
- gr.Markdown("Use this interface to ingest your own audio file and generate a transcript for it")
+ gr.Markdown("# Ingest Audio Using Generated Transcripts")
+ gr.Markdown("Use this interface to ingest an audio file and generate a transcript for it")
with gr.Row():
with gr.Column(scale=6):
audio_upload = gr.Audio(type="filepath")
@@ -440,17 +520,14 @@ def select_upload_type(choice, request: gr.Request):
audio_upload.clear(hide_text, [], [text_upload_result])
with gr.Blocks() as upload_pdf:
- gr.Markdown("# Ingest Your Own PDF")
- gr.Markdown("Use this interface to ingest your own PDF file with text, tables, images, and graphs")
+ gr.Markdown("# Ingest PDF Files")
+ gr.Markdown("Use this interface to ingest a PDF file with text and images")
with gr.Row():
with gr.Column(scale=6):
- image_upload_cap = gr.File()
+ pdf_upload = PDF(label="PDF File")
with gr.Column(scale=3):
- text_upload_result_cap = gr.Textbox(visible=False, interactive=False, label="Upload Status")
- image_upload_cap.upload(
- ingest_gen_caption, [image_upload_cap, gr.Textbox(value="PDF", visible=False)], [text_upload_result_cap]
- )
- image_upload_cap.clear(hide_text, [], [text_upload_result_cap])
+ pdf_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status")
+ pdf_upload.upload(ingest_pdf, [pdf_upload], [pdf_upload_result])
with gr.Blocks() as qna:
state = gr.State(multimodalqna_conv.copy())
@@ -458,15 +535,15 @@ def select_upload_type(choice, request: gr.Request):
with gr.Column(scale=2):
video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media")
image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media")
+ pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media")
with gr.Column(scale=9):
chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", height=390)
with gr.Row():
with gr.Column(scale=8):
with gr.Tabs():
- with gr.TabItem("Text Query"):
- textbox = gr.Textbox(
- show_label=False,
- container=True,
+ with gr.TabItem("Text & Image Query"):
+ textbox = gr.MultimodalTextbox(
+ show_label=False, container=True, submit_btn=False, file_types=["image"]
)
with gr.TabItem("Audio Query"):
audio = gr.Audio(
@@ -486,7 +563,7 @@ def select_upload_type(choice, request: gr.Request):
[
state,
],
- [state, chatbot, textbox, audio, video, image, clear_btn],
+ [state, chatbot, textbox, audio, video, image, pdf, clear_btn],
)
submit_btn.click(
@@ -498,7 +575,7 @@ def select_upload_type(choice, request: gr.Request):
[
state,
],
- [state, chatbot, video, image, clear_btn],
+ [state, chatbot, video, image, pdf, clear_btn],
)
with gr.Blocks(css=css) as demo:
gr.Markdown("# MultimodalQnA")
@@ -511,6 +588,8 @@ def select_upload_type(choice, request: gr.Request):
upload_image.render()
with gr.TabItem("Upload Audio"):
upload_audio.render()
+ with gr.TabItem("Upload PDF"):
+ upload_pdf.render()
demo.queue()
app = gr.mount_gradio_app(app, demo, path="/")
@@ -520,19 +599,24 @@ def select_upload_type(choice, request: gr.Request):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
- parser.add_argument("--port", type=int, default=5173)
+ parser.add_argument("--port", type=int, default=os.getenv("UI_PORT", 5173))
parser.add_argument("--concurrency-count", type=int, default=20)
parser.add_argument("--share", action="store_true")
- backend_service_endpoint = os.getenv("BACKEND_SERVICE_ENDPOINT", "http://localhost:8888/v1/multimodalqna")
+ MEGA_SERVICE_PORT = os.getenv("MEGA_SERVICE_PORT", 8888)
+ DATAPREP_MMR_PORT = os.getenv("DATAPREP_MMR_PORT", 6007)
+
+ backend_service_endpoint = os.getenv(
+ "BACKEND_SERVICE_ENDPOINT", f"http://localhost:{MEGA_SERVICE_PORT}/v1/multimodalqna"
+ )
dataprep_ingest_endpoint = os.getenv(
- "DATAPREP_INGEST_SERVICE_ENDPOINT", "http://localhost:6007/v1/ingest_with_text"
+ "DATAPREP_INGEST_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/ingest_with_text"
)
dataprep_gen_transcript_endpoint = os.getenv(
- "DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", "http://localhost:6007/v1/generate_transcripts"
+ "DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_transcripts"
)
dataprep_gen_caption_endpoint = os.getenv(
- "DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT", "http://localhost:6007/v1/generate_captions"
+ "DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_captions"
)
args = parser.parse_args()
logger.info(f"args: {args}")
diff --git a/MultimodalQnA/ui/gradio/requirements.txt b/MultimodalQnA/ui/gradio/requirements.txt
index bb784f9112..9596e21ff1 100644
--- a/MultimodalQnA/ui/gradio/requirements.txt
+++ b/MultimodalQnA/ui/gradio/requirements.txt
@@ -1,4 +1,5 @@
gradio==5.5.0
+gradio_pdf==0.0.19
moviepy==1.0.3
numpy==1.26.4
opencv-python==4.10.0.82
diff --git a/MultimodalQnA/ui/gradio/utils.py b/MultimodalQnA/ui/gradio/utils.py
index 3d7be10118..c22d102a5a 100644
--- a/MultimodalQnA/ui/gradio/utils.py
+++ b/MultimodalQnA/ui/gradio/utils.py
@@ -126,7 +126,7 @@ def make_temp_image(
output_image_name: str = "image_tmp",
):
Path(output_image_path).mkdir(parents=True, exist_ok=True)
- output_image = os.path.join(output_image_path, "{}.{}".format(output_image_name, file_ext))
+ output_image = os.path.join(output_image_path, output_image_name + ("." + file_ext.lstrip(".") if file_ext else ""))  # file_ext may be "png" or ".png"
shutil.copy(image_name, output_image)
return output_image