diff --git a/.github/workflows/docker/compose/animation-compose.yaml b/.github/workflows/docker/compose/animation-compose.yaml index 957e4273cc..32b2a247a0 100644 --- a/.github/workflows/docker/compose/animation-compose.yaml +++ b/.github/workflows/docker/compose/animation-compose.yaml @@ -5,13 +5,13 @@ services: animation: build: - dockerfile: comps/animation/wav2lip/Dockerfile + dockerfile: comps/animation/src/Dockerfile image: ${REGISTRY:-opea}/animation:${TAG:-latest} wav2lip: build: - dockerfile: comps/animation/wav2lip/dependency/Dockerfile + dockerfile: comps/animation/src/integration/dependency/Dockerfile image: ${REGISTRY:-opea}/wav2lip:${TAG:-latest} wav2lip-gaudi: build: - dockerfile: comps/animation/wav2lip/dependency/Dockerfile.intel_hpu + dockerfile: comps/animation/src/integration/dependency/Dockerfile.intel_hpu image: ${REGISTRY:-opea}/wav2lip-gaudi:${TAG:-latest} diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index ee3b0002ff..69cbadd534 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -55,19 +55,11 @@ services: build: dockerfile: comps/dataprep/neo4j/llama_index/Dockerfile image: ${REGISTRY:-opea}/dataprep-neo4j-llamaindex:${TAG:-latest} - dataprep-multimedia2text: - build: - dockerfile: comps/dataprep/multimedia2text/Dockerfile - image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest} - dataprep-video2audio: - build: - dockerfile: comps/dataprep/multimedia2text/video2audio/Dockerfile - image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest} - dataprep-audio2text: - build: - dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile - image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest} dataprep-elasticsearch: build: dockerfile: comps/dataprep/elasticsearch/langchain/Dockerfile image: ${REGISTRY:-opea}/dataprep-elasticsearch:${TAG:-latest} + dataprep-opensearch: + build: + dockerfile: comps/dataprep/opensearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-opensearch:${TAG:-latest} diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index 5396866118..a81b6a9952 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -51,3 +51,7 @@ services: build: dockerfile: comps/retrievers/elasticsearch/langchain/Dockerfile image: ${REGISTRY:-opea}/retriever-elasticsearch:${TAG:-latest} + retriever-opensearch: + build: + dockerfile: comps/retrievers/opensearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-opensearch:${TAG:-latest} diff --git a/comps/animation/wav2lip/Dockerfile b/comps/animation/src/Dockerfile similarity index 67% rename from comps/animation/wav2lip/Dockerfile rename to comps/animation/src/Dockerfile index bc1915b6bf..2608178272 100644 --- a/comps/animation/wav2lip/Dockerfile +++ b/comps/animation/src/Dockerfile @@ -15,10 +15,10 @@ ARG ARCH=cpu COPY comps /home/user/comps RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/comps/animation/wav2lip/requirements.txt ; + pip install --no-cache-dir -r /home/user/comps/animation/src/requirements.txt ; ENV PYTHONPATH=$PYTHONPATH:/home/user -WORKDIR /home/user/comps/animation/wav2lip +WORKDIR /home/user/comps/animation/src -ENTRYPOINT ["python3", "animation.py"] +ENTRYPOINT ["python3", "opea_animation_microservice.py"] diff 
--git a/comps/animation/wav2lip/README.md b/comps/animation/src/README.md similarity index 67% rename from comps/animation/wav2lip/README.md rename to comps/animation/src/README.md index 3eb5bb4778..c3855955b4 100644 --- a/comps/animation/wav2lip/README.md +++ b/comps/animation/src/README.md @@ -16,19 +16,19 @@ cd GenAIComps - Xeon CPU ```bash -docker build -t opea/wav2lip:latest -f comps/animation/wav2lip/dependency/Dockerfile . +docker build -t opea/wav2lip:latest -f comps/animation/src/integration/dependency/Dockerfile . ``` - Gaudi2 HPU ```bash -docker build -t opea/wav2lip-gaudi:latest -f comps/animation/wav2lip/dependency/Dockerfile.intel_hpu . +docker build -t opea/wav2lip-gaudi:latest -f comps/animation/src/integration/dependency/Dockerfile.intel_hpu . ``` ### 1.1.2 Animation server image ```bash -docker build -t opea/animation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/animation/wav2lip/Dockerfile . +docker build -t opea/animation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/animation/src/Dockerfile . ``` ## 1.2. Set environment variables @@ -78,13 +78,13 @@ export FPS=10 - Xeon CPU ```bash -docker run --privileged -d --name "wav2lip-service" -p 7860:7860 --ipc=host -w /home/user/comps/animation/wav2lip -e PYTHON=/usr/bin/python3.11 -v $(pwd)/comps/animation/wav2lip/assets:/home/user/comps/animation/wav2lip/assets -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT opea/wav2lip:latest +docker run --privileged -d --name "wav2lip-service" -p 7860:7860 --ipc=host -w /home/user/comps/animation/src -e PYTHON=/usr/bin/python3.11 -v $(pwd)/comps/animation/src/assets:/home/user/comps/animation/src/assets -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT opea/wav2lip:latest ``` - Gaudi2 HPU ```bash -docker run --privileged -d --name "wav2lip-gaudi-service" -p 7860:7860 --runtime=habana --cap-add=sys_nice --ipc=host -w /home/user/comps/animation/wav2lip -v $(pwd)/comps/animation/wav2lip/assets:/home/user/comps/animation/wav2lip/assets -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PYTHON=/usr/bin/python3.10 -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT opea/wav2lip-gaudi:latest +docker run --privileged -d --name "wav2lip-gaudi-service" -p 7860:7860 --runtime=habana --cap-add=sys_nice --ipc=host -w /home/user/comps/animation/src -v $(pwd)/comps/animation/src/assets:/home/user/comps/animation/src/assets -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PYTHON=/usr/bin/python3.10 -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT 
opea/wav2lip-gaudi:latest ``` ## 2.2 Run Animation Microservice @@ -101,7 +101,7 @@ Once microservice starts, user can use below script to validate the running micr ```bash cd GenAIComps -python3 comps/animation/wav2lip/dependency/check_wav2lip_server.py +python3 comps/animation/src/integration/dependency/check_wav2lip_server.py ``` ## 3.2 Validate Animation service @@ -109,20 +109,20 @@ python3 comps/animation/wav2lip/dependency/check_wav2lip_server.py ```bash cd GenAIComps export ip_address=$(hostname -I | awk '{print $1}') -curl http://${ip_address}:9066/v1/animation -X POST -H "Content-Type: application/json" -d @comps/animation/wav2lip/assets/audio/sample_question.json +curl http://${ip_address}:9066/v1/animation -X POST -H "Content-Type: application/json" -d @comps/animation/src/assets/audio/sample_question.json ``` or ```bash cd GenAIComps -python3 comps/animation/wav2lip/dependency/check_animation_server.py +python3 comps/animation/src/integration/dependency/check_animation_server.py ``` The expected output will be a message similar to the following: ```bash -{'wav2lip_result': '....../GenAIComps/comps/animation/wav2lip/assets/outputs/result.mp4'} +{'wav2lip_result': '....../GenAIComps/comps/animation/src/assets/outputs/result.mp4'} ``` -Please find "comps/animation/wav2lip/assets/outputs/result.mp4" as a reference generated video. +Please find "comps/animation/src/assets/outputs/result.mp4" as a reference generated video. diff --git a/comps/animation/wav2lip/dependency/__init__.py b/comps/animation/src/__init__.py similarity index 100% rename from comps/animation/wav2lip/dependency/__init__.py rename to comps/animation/src/__init__.py diff --git a/comps/animation/wav2lip/assets/audio/eg3_ref.wav b/comps/animation/src/assets/audio/eg3_ref.wav similarity index 100% rename from comps/animation/wav2lip/assets/audio/eg3_ref.wav rename to comps/animation/src/assets/audio/eg3_ref.wav diff --git a/comps/animation/wav2lip/assets/audio/sample_question.json b/comps/animation/src/assets/audio/sample_question.json similarity index 100% rename from comps/animation/wav2lip/assets/audio/sample_question.json rename to comps/animation/src/assets/audio/sample_question.json diff --git a/comps/animation/wav2lip/assets/audio/sample_whoareyou.json b/comps/animation/src/assets/audio/sample_whoareyou.json similarity index 100% rename from comps/animation/wav2lip/assets/audio/sample_whoareyou.json rename to comps/animation/src/assets/audio/sample_whoareyou.json diff --git a/comps/animation/wav2lip/assets/img/avatar1.jpg b/comps/animation/src/assets/img/avatar1.jpg similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar1.jpg rename to comps/animation/src/assets/img/avatar1.jpg diff --git a/comps/animation/wav2lip/assets/img/avatar2.jpg b/comps/animation/src/assets/img/avatar2.jpg similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar2.jpg rename to comps/animation/src/assets/img/avatar2.jpg diff --git a/comps/animation/wav2lip/assets/img/avatar3.png b/comps/animation/src/assets/img/avatar3.png similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar3.png rename to comps/animation/src/assets/img/avatar3.png diff --git a/comps/animation/wav2lip/assets/img/avatar4.png b/comps/animation/src/assets/img/avatar4.png similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar4.png rename to comps/animation/src/assets/img/avatar4.png diff --git a/comps/animation/wav2lip/assets/img/avatar5.png b/comps/animation/src/assets/img/avatar5.png 
similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar5.png rename to comps/animation/src/assets/img/avatar5.png diff --git a/comps/animation/wav2lip/assets/img/avatar6.png b/comps/animation/src/assets/img/avatar6.png similarity index 100% rename from comps/animation/wav2lip/assets/img/avatar6.png rename to comps/animation/src/assets/img/avatar6.png diff --git a/comps/animation/wav2lip/assets/img/flowchart.png b/comps/animation/src/assets/img/flowchart.png similarity index 100% rename from comps/animation/wav2lip/assets/img/flowchart.png rename to comps/animation/src/assets/img/flowchart.png diff --git a/comps/animation/wav2lip/assets/img/gaudi.png b/comps/animation/src/assets/img/gaudi.png similarity index 100% rename from comps/animation/wav2lip/assets/img/gaudi.png rename to comps/animation/src/assets/img/gaudi.png diff --git a/comps/animation/wav2lip/assets/img/opea_gh_qr.png b/comps/animation/src/assets/img/opea_gh_qr.png similarity index 100% rename from comps/animation/wav2lip/assets/img/opea_gh_qr.png rename to comps/animation/src/assets/img/opea_gh_qr.png diff --git a/comps/animation/wav2lip/assets/img/opea_qr.png b/comps/animation/src/assets/img/opea_qr.png similarity index 100% rename from comps/animation/wav2lip/assets/img/opea_qr.png rename to comps/animation/src/assets/img/opea_qr.png diff --git a/comps/animation/wav2lip/assets/img/xeon.jpg b/comps/animation/src/assets/img/xeon.jpg similarity index 100% rename from comps/animation/wav2lip/assets/img/xeon.jpg rename to comps/animation/src/assets/img/xeon.jpg diff --git a/comps/animation/wav2lip/assets/outputs/results.mp4 b/comps/animation/src/assets/outputs/results.mp4 similarity index 100% rename from comps/animation/wav2lip/assets/outputs/results.mp4 rename to comps/animation/src/assets/outputs/results.mp4 diff --git a/comps/animation/wav2lip/check_animation_server.py b/comps/animation/src/check_animation_server.py similarity index 86% rename from comps/animation/wav2lip/check_animation_server.py rename to comps/animation/src/check_animation_server.py index 8152714475..b4511006c6 100644 --- a/comps/animation/wav2lip/check_animation_server.py +++ b/comps/animation/src/check_animation_server.py @@ -11,7 +11,7 @@ outfile = os.environ.get("OUTFILE") # Read the JSON file -with open("comps/animation/wav2lip/assets/audio/sample_question.json", "r") as file: +with open("comps/animation/src/assets/audio/sample_question.json", "r") as file: data = json.load(file) response = requests.post(url=endpoint, json=data, headers={"Content-Type": "application/json"}, proxies={"http": None}) diff --git a/comps/animation/wav2lip/docker_run.sh b/comps/animation/src/docker_run.sh similarity index 100% rename from comps/animation/wav2lip/docker_run.sh rename to comps/animation/src/docker_run.sh diff --git a/comps/animation/src/integration/__init__.py b/comps/animation/src/integration/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/animation/src/integration/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/animation/wav2lip/dependency/Dockerfile b/comps/animation/src/integration/dependency/Dockerfile similarity index 90% rename from comps/animation/wav2lip/dependency/Dockerfile rename to comps/animation/src/integration/dependency/Dockerfile index 2f1aa1b76c..e9c90bccf2 100644 --- a/comps/animation/wav2lip/dependency/Dockerfile +++ b/comps/animation/src/integration/dependency/Dockerfile @@ -25,11 +25,11 @@ RUN 
apt-get update -y && apt-get install -y --no-install-recommends --fix-missin # Install GenAIComps RUN mkdir -p /home/user/comps COPY comps /home/user/comps -COPY comps/animation/wav2lip/dependency/entrypoint.sh /usr/local/bin/entrypoint.sh +COPY comps/animation/src/integration/dependency/entrypoint.sh /usr/local/bin/entrypoint.sh # Install ffmpeg with x264 software codec -RUN git clone https://github.com/FFmpeg/FFmpeg.git /home/user/comps/animation/wav2lip/FFmpeg -WORKDIR /home/user/comps/animation/wav2lip/FFmpeg +RUN git clone https://github.com/FFmpeg/FFmpeg.git /home/user/comps/animation/src/FFmpeg +WORKDIR /home/user/comps/animation/src/FFmpeg RUN ./configure --enable-gpl --enable-libx264 --enable-cross-compile && \ make -j$(nproc-1) && \ make install && \ @@ -53,7 +53,7 @@ ENV PYTHONPATH="$PYTHONPATH:/usr/local/lib/python3.11/site-packages/gfpgan" WORKDIR /usr/local/lib/python3.11/site-packages # Install pip dependencies -RUN pip install -r /home/user/comps/animation/wav2lip/requirements.txt +RUN pip install -r /home/user/comps/animation/src/requirements.txt # Custom patches # Modify the degradations.py file to import rgb_to_grayscale from torchvision.transforms.functional @@ -66,7 +66,7 @@ RUN sed -i "s/if 'cpu' not in device and 'cuda' not in device:/if 'cpu' not in d RUN sed -i 's/hp.sample_rate, hp.n_fft/sr=hp.sample_rate, n_fft=hp.n_fft/' /usr/local/lib/python3.11/site-packages/Wav2Lip/audio.py # Set the working directory -WORKDIR /home/user/comps/animation/wav2lip/ +WORKDIR /home/user/comps/animation/src/ # Define the command to run when the container starts RUN chmod +x /usr/local/bin/entrypoint.sh diff --git a/comps/animation/wav2lip/dependency/Dockerfile.intel_hpu b/comps/animation/src/integration/dependency/Dockerfile.intel_hpu similarity index 93% rename from comps/animation/wav2lip/dependency/Dockerfile.intel_hpu rename to comps/animation/src/integration/dependency/Dockerfile.intel_hpu index 218bfc0045..fac3a75487 100644 --- a/comps/animation/wav2lip/dependency/Dockerfile.intel_hpu +++ b/comps/animation/src/integration/dependency/Dockerfile.intel_hpu @@ -19,7 +19,7 @@ RUN rm -rf /var/lib/apt/lists/* # Install GenAIComps RUN mkdir -p /home/user/comps COPY comps /home/user/comps -COPY comps/animation/wav2lip/dependency/entrypoint.sh /usr/local/bin/entrypoint.sh +COPY comps/animation/src/integration/dependency/entrypoint.sh /usr/local/bin/entrypoint.sh # Install ffmpeg with x264 software codec RUN git clone https://github.com/FFmpeg/FFmpeg.git /home/user/comps/animation/FFmpeg @@ -47,7 +47,7 @@ ENV PYTHONPATH="$PYTHONPATH:/usr/local/lib/python3.10/dist-packages/gfpgan" WORKDIR /usr/local/lib/python3.10/dist-packages # Install pip dependencies -RUN pip install -r /home/user/comps/animation/wav2lip/requirements.txt +RUN pip install -r /home/user/comps/animation/src/requirements.txt # Custom patches # Modify the degradations.py file to import rgb_to_grayscale from torchvision.transforms.functional @@ -60,7 +60,7 @@ RUN sed -i "s/if 'cpu' not in device and 'cuda' not in device:/if 'cpu' not in d RUN sed -i 's/hp.sample_rate, hp.n_fft/sr=hp.sample_rate, n_fft=hp.n_fft/' /usr/local/lib/python3.10/dist-packages/Wav2Lip/audio.py # Set the working directory -WORKDIR /home/user/comps/animation/wav2lip +WORKDIR /home/user/comps/animation/src # Define the command to run when the container starts RUN chmod +x /usr/local/bin/entrypoint.sh diff --git a/comps/animation/src/integration/dependency/__init__.py b/comps/animation/src/integration/dependency/__init__.py new file mode 100644 index
0000000000..916f3a44b2 --- /dev/null +++ b/comps/animation/src/integration/dependency/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/animation/wav2lip/dependency/check_wav2lip_server.py b/comps/animation/src/integration/dependency/check_wav2lip_server.py similarity index 82% rename from comps/animation/wav2lip/dependency/check_wav2lip_server.py rename to comps/animation/src/integration/dependency/check_wav2lip_server.py index 399f027d90..c8c7838388 100644 --- a/comps/animation/wav2lip/dependency/check_wav2lip_server.py +++ b/comps/animation/src/integration/dependency/check_wav2lip_server.py @@ -10,7 +10,7 @@ outfile = os.environ.get("OUTFILE") # Read the JSON file -with open("comps/animation/wav2lip/assets/audio/sample_whoareyou.json", "r") as file: +with open("comps/animation/src/assets/audio/sample_whoareyou.json", "r") as file: data = json.load(file) inputs = {"audio": data["byte_str"], "max_tokens": 64} diff --git a/comps/animation/wav2lip/dependency/download_ckpts.sh b/comps/animation/src/integration/dependency/download_ckpts.sh similarity index 100% rename from comps/animation/wav2lip/dependency/download_ckpts.sh rename to comps/animation/src/integration/dependency/download_ckpts.sh diff --git a/comps/animation/wav2lip/dependency/entrypoint.sh b/comps/animation/src/integration/dependency/entrypoint.sh similarity index 96% rename from comps/animation/wav2lip/dependency/entrypoint.sh rename to comps/animation/src/integration/dependency/entrypoint.sh index 1004b3594b..37c8db22e7 100644 --- a/comps/animation/wav2lip/dependency/entrypoint.sh +++ b/comps/animation/src/integration/dependency/entrypoint.sh @@ -23,7 +23,7 @@ export PT_HPU_LAZY_MODE=0 export PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES=1 # Wav2Lip, GFPGAN -cd /home/user/comps/animation/wav2lip/ || exit +cd /home/user/comps/animation/src/integration/ || exit python3 dependency/wav2lip_server.py \ --device $DEVICE \ --port $((WAV2LIP_PORT)) \ diff --git a/comps/animation/wav2lip/dependency/utils.py b/comps/animation/src/integration/dependency/utils.py similarity index 100% rename from comps/animation/wav2lip/dependency/utils.py rename to comps/animation/src/integration/dependency/utils.py diff --git a/comps/animation/wav2lip/dependency/wav2lip_server.py b/comps/animation/src/integration/dependency/wav2lip_server.py similarity index 100% rename from comps/animation/wav2lip/dependency/wav2lip_server.py rename to comps/animation/src/integration/dependency/wav2lip_server.py diff --git a/comps/animation/src/integration/opea.py b/comps/animation/src/integration/opea.py new file mode 100644 index 0000000000..16cb2b5d12 --- /dev/null +++ b/comps/animation/src/integration/opea.py @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import json +import os + +import requests + +from comps import CustomLogger, OpeaComponent, ServiceType + +logger = CustomLogger("opea_animation") +logflag = os.getenv("LOGFLAG", False) + + +class OpeaAnimation(OpeaComponent): + """A specialized animation component derived from OpeaComponent.""" + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.ANIMATION.name.lower(), description, config) + self.base_url = os.getenv("WAV2LIP_ENDPOINT", "http://localhost:7860") + + def invoke(self, input: str): + """Invokes the animation service to generate a lip-synced video from the input audio.
+ + Args: + input (Audio Byte Str) + """ + inputs = {"audio": input} + + response = requests.post(url=f"{self.base_url}/v1/wav2lip", data=json.dumps(inputs), proxies={"http": None}) + + outfile = response.json()["wav2lip_result"] + return outfile + + def check_health(self) -> bool: + """Checks the health of the animation service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + try: + response = requests.get(f"{self.base_url}/v1/health") + # If status is 200, the service is considered alive + if response.status_code == 200: + return True + else: + return False + except Exception as e: + # Handle connection errors, timeouts, etc. + logger.error(f"Health check failed: {e}") + return False diff --git a/comps/animation/wav2lip/animation.py b/comps/animation/src/opea_animation_microservice.py similarity index 63% rename from comps/animation/wav2lip/animation.py rename to comps/animation/src/opea_animation_microservice.py index bacf6b45f2..13ea92cbbc 100644 --- a/comps/animation/wav2lip/animation.py +++ b/comps/animation/src/opea_animation_microservice.py @@ -8,12 +8,11 @@ import os import time -import requests - # GenAIComps -from comps import CustomLogger +from comps import CustomLogger, OpeaComponentController +from comps.animation.src.integration.opea import OpeaAnimation -logger = CustomLogger("animation") +logger = CustomLogger("opea_animation") logflag = os.getenv("LOGFLAG", False) from comps import ( Base64ByteStrDoc, @@ -25,6 +24,23 @@ statistics_dict, ) +# Initialize OpeaComponentController +controller = OpeaComponentController() + +# Register components +try: + # Instantiate Animation component and register it to controller + opea_animation = OpeaAnimation( + name="OpeaAnimation", + description="OPEA Animation Service", + ) + controller.register(opea_animation) + + # Discover and activate a healthy component + controller.discover_and_activate() +except Exception as e: + logger.error(f"Failed to initialize components: {e}") + # Register the microservice @register_microservice( @@ -37,19 +53,11 @@ output_datatype=VideoPath, ) @register_statistics(names=["opea_service@animation"]) -async def animate(audio: Base64ByteStrDoc): +def animate(audio: Base64ByteStrDoc): start = time.time() - byte_str = audio.byte_str - inputs = {"audio": byte_str} - if logflag: - logger.info(inputs) - - response = requests.post(url=f"{wav2lip_endpoint}/v1/wav2lip", data=json.dumps(inputs), proxies={"http": None}) - - outfile = response.json()["wav2lip_result"] + outfile = opea_animation.invoke(audio.byte_str) if logflag: - logger.info(response) logger.info(f"Video generated successfully, check {outfile} for the result.") statistics_dict["opea_service@animation"].append_latency(time.time() - start, None) @@ -57,6 +65,5 @@ async def animate(audio: Base64ByteStrDoc): if __name__ == "__main__": - wav2lip_endpoint = os.getenv("WAV2LIP_ENDPOINT", "http://localhost:7860") logger.info("[animation - router] Animation initialized.") opea_microservices["opea_service@animation"].start() diff --git a/comps/animation/wav2lip/requirements.txt b/comps/animation/src/requirements.txt similarity index 100% rename from comps/animation/wav2lip/requirements.txt rename to comps/animation/src/requirements.txt diff --git a/comps/dataprep/multimedia2text/Dockerfile b/comps/dataprep/multimedia2text/Dockerfile deleted file mode 100644 index 54b39b72fc..0000000000 --- a/comps/dataprep/multimedia2text/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# 
SPDX-License-Identifier: Apache-2.0 - -# Use the official Python 3.11 slim image as the base image -FROM python:3.11-slim - -# Set environment variables -ENV LANG=C.UTF-8 - -# Install necessary packages and clean up to reduce image size -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - build-essential \ - libgl1-mesa-glx \ - libjemalloc-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Create a directory for the user and set it as the working directory -WORKDIR /home/user - -# Copy the application code and requirements file to the container -COPY comps /home/user/comps -COPY requirements.txt /home/user/requirements.txt -COPY ./comps/dataprep/multimedia2text/multimedia2text.py /home/user/multimedia2text.py - -# Install Python dependencies -RUN python -m pip install --no-cache-dir -r requirements.txt - -# Define the entry point for the container -ENTRYPOINT ["python", "multimedia2text.py"] diff --git a/comps/dataprep/multimedia2text/README.md b/comps/dataprep/multimedia2text/README.md deleted file mode 100644 index 3adef100e9..0000000000 --- a/comps/dataprep/multimedia2text/README.md +++ /dev/null @@ -1,220 +0,0 @@ -# Multimedia to Text Services - -This guide provides instructions on how to build and run various Docker services for converting multimedia content to text. The services include: - -1. **Whisper Service**: Converts audio to text. -2. **A2T Service**: Another service for audio to text conversion. -3. **Video to Audio Service**: Extracts audio from video files. -4. **Multimedia2Text Service**: Transforms multimedia data to text data. - -## Prerequisites - -1. **Docker**: Ensure you have Docker installed and running on your system. You can download and install Docker from the [official Docker website](https://www.docker.com/get-started). - -2. **Proxy Settings**: If you are behind a corporate firewall, make sure you have the necessary proxy settings configured. This will ensure that Docker and other tools can access the internet. - -3. **Python**: If you want to validate services using the provided Python scripts, ensure you have Python 3.11 installed. The current validation tests have been tested with Python 3.11. You can check your Python version by running the following command in your terminal: - ```bash - python --version - ``` - -## Getting Started - -First, navigate to the `GenAIComps` directory: - -```bash -cd GenAIComps -``` - -### Whisper Service - -The Whisper Service converts audio files to text. Follow these steps to build and run the service: - -#### Build - -```bash -docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile . -``` - -#### Run - -```bash -docker run -d -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest -``` - -### A2T Service - -The A2T Service is another service for converting audio to text. Follow these steps to build and run the service: - -#### Build - -```bash -docker build -t opea/a2t:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile . -``` - -#### Run - -```bash -host_ip=$(hostname -I | awk '{print $1}') - -docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e A2T_ENDPOINT=http://$host_ip:7066 opea/a2t:latest -``` - -### Video to Audio Service - -The Video to Audio Service extracts audio from video files. 
Follow these steps to build and run the service: - -#### Build - -```bash -docker build -t opea/v2a:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile . -``` - -#### Run - -```bash -docker run -d -p 7078:7078 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/v2a:latest -``` - -### Multimedia2Text Service - -The Multimedia2Text Service transforms multimedia data to text data. Follow these steps to build and run the service: - -#### Build - -```bash -docker build -t opea/multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile . -``` - -#### Run - -```bash -host_ip=$(hostname -I | awk '{print $1}') - -docker run -d -p 7079:7079 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ - -e A2T_ENDPOINT=http://$host_ip:7066 \ - -e V2A_ENDPOINT=http://$host_ip:7078 \ - opea/multimedia2text:latest -``` - -## Validate Microservices - -After building and running the services, you can validate them using the provided Python scripts. Below are the steps to validate each service: - -### Whisper Service - -Run the following command to validate the Whisper Service: - -```bash -python comps/asr/whisper/dependency/check_whisper_server.py -``` - -Expected output: - -``` -{'asr_result': 'who is pat gelsinger'} -``` - -### Audio2Text Service - -Run the following command to validate the Audio2Text Service: - -```bash -python comps/dataprep/multimedia2text/audio2text/check_a2t_server.py -``` - -Expected output: - -``` -Test passed successfully! -``` - -_Note: The `id` value will be different._ - -### Video2Audio Service - -Run the following command to validate the Video2Audio Service: - -```bash -python comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py -``` - -Expected output: - -``` -========= Audio file saved as ====== -comps/dataprep/multimedia2text/video2audio/converted_audio.wav -==================================== -``` - -### Multimedia2Text Service - -Run the following command to validate the Multimedia2Text Service: - -```bash -python comps/dataprep/multimedia2text/check_multimedia2text.py -``` - -Expected output: - -``` -Running test: Whisper service ->>> Whisper service Test Passed ... - -Running test: Audio2Text service ->>> Audio2Text service Test Passed ... - -Running test: Video2Text service ->>> Video2Text service Test Passed ... - -Running test: Multimedia2text service ->>> Multimedia2text service test for text data type passed ... ->>> Multimedia2text service test for audio data type passed ... ->>> Multimedia2text service test for video data type passed ... -``` - -## How to Stop/Remove Services - -To stop and remove the Docker containers and images associated with the multimedia-to-text services, follow these steps: - -1. **List Running Containers**: First, list all running Docker containers to identify the ones you want to stop and remove. - - ```bash - docker ps - ``` - -2. **Stop Containers**: Use the `docker stop` command followed by the container IDs or names to stop the running containers. - - ```bash - docker stop - ``` - - If you want to stop all running containers at once, you can use: - - ```bash - docker stop $(docker ps -q) - ``` - -3. **Remove Containers**: After stopping the containers, use the `docker rm` command followed by the container IDs or names to remove them. 
- - ```bash - docker rm - ``` - - Optionally, you can remove the stopped containers to free up resources: - - ```bash - docker rm $(docker ps -a -q) - ``` - -4. **Remove Images**: If you also want to remove the Docker images, use the `docker rmi` command followed by the image IDs or names. - - ```bash - docker rmi - ``` - - To remove all unused images, you can use: - - ```bash - docker image prune -a - ``` diff --git a/comps/dataprep/multimedia2text/audio2text/Dockerfile b/comps/dataprep/multimedia2text/audio2text/Dockerfile deleted file mode 100644 index 57707260f4..0000000000 --- a/comps/dataprep/multimedia2text/audio2text/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Use the official Python 3.11 slim image as the base image -FROM python:3.11-slim - -# Create a new user and set up the home directory -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ -USER user - -# Set environment variables -ENV LANG=C.UTF-8 -ARG ARCH=cpu - -# Copy the application code and requirements file to the container -COPY comps /home/user/comps -COPY requirements.txt /home/user/requirements.txt - -# Install Python dependencies -RUN pip install --no-cache-dir --upgrade pip && \ - if [ "${ARCH}" = "cpu" ]; then \ - pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ - pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/requirements.txt ; \ - else \ - pip install --no-cache-dir -r /home/user/requirements.txt ; \ - fi - -# Set the PYTHONPATH environment variable -ENV PYTHONPATH=$PYTHONPATH:/home/user - -# Set the working directory -WORKDIR /home/user/comps/dataprep/multimedia2text/audio2text - -# Define the entry point for the container -ENTRYPOINT ["python", "audio2text.py"] diff --git a/comps/dataprep/multimedia2text/audio2text/audio2text.py b/comps/dataprep/multimedia2text/audio2text/audio2text.py deleted file mode 100644 index 650c5704c3..0000000000 --- a/comps/dataprep/multimedia2text/audio2text/audio2text.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -import requests - -from comps import CustomLogger - -# Initialize custom logger -logger = CustomLogger("a2t") -logflag = os.getenv("LOGFLAG", False) - -from comps import ( - Audio2text, - Base64ByteStrDoc, - ServiceType, - TextDoc, - opea_microservices, - register_microservice, - register_statistics, -) - - -# Register the microservice -@register_microservice( - name="opea_service@a2t", - service_type=ServiceType.ASR, - endpoint="/v1/audio/transcriptions", - host="0.0.0.0", - port=9099, - input_datatype=Base64ByteStrDoc, - output_datatype=Audio2text, -) -@register_statistics(names=["opea_service@a2t"]) -async def audio_to_text(audio: Base64ByteStrDoc): - """Convert audio to text and return the transcription. - - Args: - audio (Base64ByteStrDoc): The incoming request containing the audio in base64 format. - - Returns: - TextDoc: The response containing the transcription text. 
- """ - try: - # Validate the input - if not audio or not audio.byte_str: - raise ValueError("Invalid input: 'audio' or 'audio.byte_str' is missing.") - - byte_str = audio.byte_str - inputs = {"audio": byte_str} - - if logflag: - logger.info(f"Inputs: {inputs}") - - # Send the POST request to the ASR endpoint - response = requests.post(url=f"{a2t_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) - response.raise_for_status() # Raise an error for bad status codes - - if logflag: - logger.info(f"Response: {response.json()}") - - # Return the transcription result - return Audio2text(query=response.json()["asr_result"]) # .text - - except requests.RequestException as e: - logger.error(f"Request to ASR endpoint failed: {e}") - raise - except Exception as e: - logger.error(f"An error occurred during audio to text conversion: {e}") - raise - - -if __name__ == "__main__": - try: - # Get the ASR endpoint from environment variables or use the default - a2t_endpoint = os.getenv("A2T_ENDPOINT", "http://localhost:7066") - - # Log initialization message - logger.info("[a2t - router] A2T initialized.") - - # Start the microservice - opea_microservices["opea_service@a2t"].start() - - except Exception as e: - logger.error(f"Failed to start the microservice: {e}") - raise diff --git a/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py b/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py deleted file mode 100644 index 8009fc5435..0000000000 --- a/comps/dataprep/multimedia2text/audio2text/check_a2t_server.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import base64 -import json -import os - -import requests - -# Get the root folder of the current script -root_folder = os.path.dirname(os.path.abspath(__file__)) - - -def audio_to_text(path_to_audio): - """Convert an audio file to text by sending a request to the server. - - Args: - path_to_audio (str): Path to the audio file. - - Returns: - str: The transcribed text. - """ - file_name = os.path.join(root_folder, path_to_audio) - - # Read the audio file and encode it in base64 - with open(file_name, "rb") as f: - audio_base64_str = base64.b64encode(f.read()).decode("utf-8") - - endpoint = "http://localhost:9099/v1/audio/transcriptions" - inputs = {"byte_str": audio_base64_str} - - # Send the POST request to the server - response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) - - # Check if the request was successful - response.raise_for_status() - - # Return the transcribed text - return response.json()["query"] - - -def check_response(response): - """Check the response from the server and print the result. - - Args: - response (str): The transcribed text from the server. - """ - expected_response = "well" - assert response == expected_response, f"Expected '{expected_response}', but got '{response}'" - print("Test passed successfully!") - - -def read_config(): - """Read the configuration parameters from the input file. - - Returns: - argparse.Namespace: Parsed arguments. 
- """ - # Create an argument parser - parser = argparse.ArgumentParser(description="Process configuration parameters.") - - # Add argument for the audio file path - parser.add_argument( - "--path_to_audio", - help="Location of the audio file that will be converted to text.", - required=False, - default=os.path.join(root_folder, "../data/intel_short.wav"), - ) - - # Parse the arguments - args = parser.parse_args() - - # Return the parsed arguments - return args - - -if __name__ == "__main__": - # Read the configuration parameters - args = read_config() - - # Convert audio to text - response = audio_to_text(args.path_to_audio) - - # Check the response - check_response(response) diff --git a/comps/dataprep/multimedia2text/check_multimedia2text.py b/comps/dataprep/multimedia2text/check_multimedia2text.py deleted file mode 100644 index 9aeb735a72..0000000000 --- a/comps/dataprep/multimedia2text/check_multimedia2text.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import ast -import base64 -import json -import os - -import requests - -# Get the root folder of the current script -root_folder = os.path.dirname(os.path.abspath(__file__)) - - -def get_base64_str(file_name): - """Convert a file to a base64 encoded string. - - Args: - file_name (str): Path to the file. - - Returns: - str: Base64 encoded string of the file content. - """ - with open(file_name, "rb") as f: - return base64.b64encode(f.read()).decode("utf-8") - - -def post_request(endpoint, inputs): - """Send a POST request to the specified endpoint. - - Args: - endpoint (str): The URL of the endpoint. - inputs (dict): The data to be sent in the request. - - Returns: - requests.Response: The response from the server. - """ - return requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) - - -def input_data_for_test(document_type): - """Generate input data for testing based on the document type. - - Args: - document_type (str): The type of document ("text", "audio", or "video"). - - Returns: - str: The input data for testing. - - Raises: - ValueError: If the document type is invalid. - """ - if document_type == "text": - input_data = "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - elif document_type == "audio": - input_data = get_base64_str(os.path.join(root_folder, "data/intel_short.wav")) - elif document_type == "video": - input_data = get_base64_str(os.path.join(root_folder, "data/intel_short.mp4")) - else: - raise ValueError("Invalid document type") - - return input_data - - -def test_whisper_service(): - """Test the Whisper service. - - Raises: - AssertionError: If the service does not return a 200 status code. 
- """ - print("Running test: Whisper service") - document_type = "audio" - endpoint = "http://localhost:7066/v1/asr" - inputs = {"audio": input_data_for_test(document_type)} - response = post_request(endpoint, inputs) - assert ( - response.status_code == 200 - ), f"Whisper service failed to get response from the server. Status code: {response.status_code}" - - # If the response status code is 200, print "Test passed" - print(">>> Whisper service Test Passed ... ") - print() - - -def test_audio2text(): - """Test the Audio2Text service. - - Raises: - AssertionError: If the service does not return a 200 status code. - """ - print("Running test: Audio2Text service") - document_type = "audio" - endpoint = "http://localhost:9099/v1/audio/transcriptions" - inputs = {"byte_str": input_data_for_test(document_type)} - response = post_request(endpoint, inputs) - assert ( - response.status_code == 200 - ), f"Audio2Text service failed to get response from the server. Status code: {response.status_code}" - - # If the response status code is 200, print "Test passed" - print(">>> Audio2Text service Test Passed ... ") - print() - - -def test_video2text(): - """Test the Video2Text service. - - Raises: - AssertionError: If the service does not return a 200 status code. - """ - print("Running test: Video2Text service") - document_type = "video" - endpoint = "http://localhost:7078/v1/video2audio" - inputs = {"byte_str": input_data_for_test(document_type)} - response = post_request(endpoint, inputs) - assert ( - response.status_code == 200 - ), f"Video2Text service failed to get response from the server. Status code: {response.status_code}" - - # If the response status code is 200, print "Test passed" - print(">>> Video2Text service Test Passed ... ") - print() - - -def test_multimedia2text_data(): - """Test the multimedia2text service for different document types. - - Raises: - AssertionError: If the service does not return a 200 status code. - """ - print("Running test: Multimedia2text service") - for document_type in ["text", "audio", "video"]: - endpoint = "http://localhost:7079/v1/multimedia2text" - inputs = {document_type: input_data_for_test(document_type)} - response = post_request(endpoint, inputs) - assert ( - response.status_code == 200 - ), f"{document_type} service failed to get response from the server. Status code: {response.status_code}" - - # If the response status code is 200, print "Test passed" - print(f">>> Multimedia2text service test for {document_type} data type passed ... ") - print() - - -if __name__ == "__main__": - # Run the tests and print the results - try: - test_whisper_service() - test_audio2text() - test_video2text() - test_multimedia2text_data() - - except AssertionError as e: - print(f"Test failed: {e}") diff --git a/comps/dataprep/multimedia2text/data/README.md b/comps/dataprep/multimedia2text/data/README.md deleted file mode 100644 index 89330dbacd..0000000000 --- a/comps/dataprep/multimedia2text/data/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Test Data for Document Summarization - -## Overview - -This document provides information about the test data used for the Document Summarization application. - -## Source of Test Data - -The data used for testing originated from the following video: - -[YouTube Video](https://www.youtube.com/watch?v=HUpnCtJRTg4) - -## Description of Test Data - -1. **Video File**: We extracted a 1-second segment from the above video and saved it as `intel_short.mp4`. -2. 
**Audio File**: The audio was extracted from the `intel_short.mp4` video file and saved as `intel_short.wav`. - -These files are used to test the functionality of the Document Summarization application, including the conversion of multimedia content to text. - -## Files - -- `intel_short.mp4`: A 1-second video segment extracted from the YouTube video. -- `intel_short.wav`: An audio file converted from the `intel_short.mp4` video file. - -## Usage - -These files can be used to validate the multimedia-to-text services provided by the Document Summarization application. Ensure that the files are placed in the appropriate directory as specified in the application's configuration. - -## License - -The original video content is subject to the terms and conditions of YouTube and the content creator. The extracted segments are used solely for testing and validation purposes. diff --git a/comps/dataprep/multimedia2text/data/intel_short.mp4 b/comps/dataprep/multimedia2text/data/intel_short.mp4 deleted file mode 100644 index 6b72f4122a..0000000000 Binary files a/comps/dataprep/multimedia2text/data/intel_short.mp4 and /dev/null differ diff --git a/comps/dataprep/multimedia2text/data/intel_short.wav b/comps/dataprep/multimedia2text/data/intel_short.wav deleted file mode 100644 index 21657414d1..0000000000 Binary files a/comps/dataprep/multimedia2text/data/intel_short.wav and /dev/null differ diff --git a/comps/dataprep/multimedia2text/multimedia2text.py b/comps/dataprep/multimedia2text/multimedia2text.py deleted file mode 100644 index 68f0181c95..0000000000 --- a/comps/dataprep/multimedia2text/multimedia2text.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -import requests - -from comps import CustomLogger - -# Initialize custom logger -logger = CustomLogger("multimedia2text") - -from comps import Audio2text, DocSumDoc, ServiceType, opea_microservices, register_microservice, register_statistics - - -# Register the microservice -@register_microservice( - name="opea_service@multimedia2text", - service_type=ServiceType.ASR, - endpoint="/v1/multimedia2text", - host="0.0.0.0", - port=7079, - input_datatype=DocSumDoc, - output_datatype=Audio2text, -) -@register_statistics(names=["opea_service@multimedia2text"]) -async def audio_to_text(input: DocSumDoc): - """Convert video or audio input to text using external services. - - Args: - input (DocSumDoc): Input document containing video, audio, or text data. - - Returns: - Audio2text: Object containing the ASR result or input text. 
- """ - response_to_return = None - - # Process video input - if input.video is not None: - logger.info(f"Processing video input at {v2a_endpoint}/v1/video2audio") - inputs = {"byte_str": input.video} - response = requests.post(url=f"{v2a_endpoint}/v1/video2audio", data=json.dumps(inputs), proxies={"http": None}) - response.raise_for_status() # Ensure the request was successful - input.audio = response.json().get("byte_str") - if input.audio is None: - logger.error("Failed to extract audio from video") - raise ValueError("Failed to extract audio from video") - - # Process audio input - if input.audio is not None: - logger.info(f"Processing audio input at {a2t_endpoint}/v1/asr") - inputs = {"audio": input.audio} - response = requests.post(url=f"{a2t_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) - response.raise_for_status() # Ensure the request was successful - response_to_return = response.json().get("asr_result") - if response_to_return is None: - logger.error("Failed to get ASR result from audio") - raise ValueError("Failed to get ASR result from audio") - - # Process text input - if input.text is not None: - logger.info("Processing text input") - response_to_return = input.text - - if response_to_return is None: - logger.warning("No valid input provided") - response_to_return = "No input" - else: - logger.info("Data Processing completeed") - - return Audio2text(query=response_to_return) - - -if __name__ == "__main__": - try: - # Get the V2T endpoint from environment variables or use the default - v2a_endpoint = os.getenv("V2A_ENDPOINT", "http://localhost:7078") - # Get the A2T endpoint from environment variables or use the default - a2t_endpoint = os.getenv("A2T_ENDPOINT", "http://localhost:7066") - - # Log initialization message - logger.info("[multimedia2text - router] multimedia2text initialized.") - - # Start the microservice - opea_microservices["opea_service@multimedia2text"].start() - - except Exception as e: - logger.error(f"Failed to start the multimedia2text microservice: {e}") - raise diff --git a/comps/dataprep/multimedia2text/video2audio/Dockerfile b/comps/dataprep/multimedia2text/video2audio/Dockerfile deleted file mode 100644 index 32b2fe8ee4..0000000000 --- a/comps/dataprep/multimedia2text/video2audio/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Use the official Python 3.11 slim image as the base image -FROM python:3.11-slim - -# Set environment variables -ENV LANG=C.UTF-8 - -# Install necessary packages -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - build-essential \ - libgl1-mesa-glx \ - libjemalloc-dev - -# Create a directory for the user -RUN mkdir -p /home/user - -# Copy the application code to the container -COPY comps /home/user/comps -COPY requirements.txt /home/user/requirements.txt -COPY ./comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py /home/user/video2audio_microservice.py - -# Install Python dependencies -RUN python -m pip install --no-cache-dir -r /home/user/requirements.txt moviepy - -# Set the working directory -WORKDIR /home/user/ - -# Define the entry point for the container -ENTRYPOINT ["python", "video2audio_microservice.py"] diff --git a/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py b/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py deleted file mode 100644 index d8499faa12..0000000000 --- 
a/comps/dataprep/multimedia2text/video2audio/check_v2a_microserver.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import base64 -import json -import os - -import requests - -# Get the root folder of the current script -root_folder = os.path.dirname(os.path.abspath(__file__)) - - -def video_to_audio(path_to_video): - """Convert a video file to an audio file in base64 format by sending a request to the server. - - Args: - path_to_video (str): Path to the video file. - - Returns: - str: Base64 encoded audio file. - """ - file_name = os.path.join(root_folder, path_to_video) - - # Read the video file and encode it in base64 - with open(file_name, "rb") as f: - video_base64_str = base64.b64encode(f.read()).decode("utf-8") - - # Define the endpoint and payload - endpoint = "http://localhost:7078/v1/video2audio" - inputs = {"byte_str": video_base64_str} - - # Send the POST request to the server - response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) - - # Check if the request was successful - response.raise_for_status() - - # Extract the base64 encoded audio from the response - audio_base64 = response.json()["byte_str"] - - return audio_base64 - - -def read_config(): - """Function to read the configuration parameters from the input file. - Returns the parsed arguments. - - Returns: - argparse.Namespace: Parsed arguments. - """ - # Create an argument parser - parser = argparse.ArgumentParser(description="Process configuration parameters.") - - # Add argument for the video file path - parser.add_argument( - "--path_to_video", - help="Location of the video file that will be converted to audio.", - required=False, - default=os.path.join(root_folder, "../data/intel_short.mp4"), - ) - - # Add argument for the audio file path - parser.add_argument( - "--path_to_audio", - help="Location to save the extracted audio file.", - required=False, - default=os.path.join(root_folder, "converted_audio.wav"), - ) - - # Parse the arguments - args = parser.parse_args() - - # Return the parsed arguments - return args - - -if __name__ == "__main__": - # Read the configuration parameters - args = read_config() - - # Extract audio from video - audio_base64 = video_to_audio(args.path_to_video) - - # Save the extracted audio to a file - with open(args.path_to_audio, "wb") as f: - f.write(base64.b64decode(audio_base64)) - - print("========= Audio file saved as ======") - print(args.path_to_audio) - print("====================================") diff --git a/comps/dataprep/multimedia2text/video2audio/video2audio.py b/comps/dataprep/multimedia2text/video2audio/video2audio.py deleted file mode 100644 index 57cc173360..0000000000 --- a/comps/dataprep/multimedia2text/video2audio/video2audio.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import base64 -import uuid -from os import path, remove - -from moviepy import VideoFileClip - -# Get the root folder of the current script -root_folder = path.dirname(path.abspath(__file__)) - - -class Video2Audio: - """Class to convert video files to audio files and handle base64 encoding.""" - - def __init__(self): - pass - - def validate_file_exists(self, file_path): - """Validate if the given file exists. - - Args: - file_path (str): Path to the file. - - Raises: - FileNotFoundError: If the file does not exist. 
- """ - if not path.isfile(file_path): - raise FileNotFoundError(f"The file {file_path} does not exist.") - - def convert_video_to_audio(self, path_to_video, audio_file_name): - """Extract mp3 audio file from mp4 video file. - - Args: - path_to_video (str): Path to the video file. - audio_file_name (str): Path to save the extracted audio file. - """ - # Validate the video file exists - self.validate_file_exists(path_to_video) - - # Extract audio from video - clip = VideoFileClip(path_to_video) - clip.audio.write_audiofile(audio_file_name) - print(f"Audio extracted and saved to {audio_file_name}") - - def convert_base64(self, file_name): - """Convert a file to a base64 encoded string and remove the file. - - Args: - file_name (str): Path to the file to be encoded. - - Returns: - str: Base64 encoded string of the file content. - """ - # Validate the file exists - self.validate_file_exists(file_name) - - # Read the file and encode it in base64 - with open(file_name, "rb") as f: - base64_str = base64.b64encode(f.read()).decode("utf-8") - - # Remove the file after encoding - remove(file_name) - - return base64_str - - def convert_video_to_audio_base64(self, video_file_name): - """Convert a video file to an audio file and return the audio file as a base64 encoded string. - - Args: - video_file_name (str): Path to the video file. - - Returns: - str: Base64 encoded string of the extracted audio file. - """ - # Generate a unique identifier for the audio file - uid = str(uuid.uuid4()) - audio_file_name = uid + ".mp3" - - # Convert the video to audio - self.convert_video_to_audio(video_file_name, audio_file_name) - - # Convert the audio file to a base64 encoded string - base64_str = self.convert_base64(audio_file_name) - - return base64_str diff --git a/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py b/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py deleted file mode 100644 index f1b4b906a1..0000000000 --- a/comps/dataprep/multimedia2text/video2audio/video2audio_microservice.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import base64 -import json -import os -import uuid - -import requests - -from comps import ( - Base64ByteStrDoc, - CustomLogger, - ServiceType, - opea_microservices, - register_microservice, - register_statistics, -) -from comps.dataprep.multimedia2text.video2audio.video2audio import Video2Audio - -# Initialize custom logger -logger = CustomLogger("video2audio") -logflag = os.getenv("LOGFLAG", False) - - -# Register the microservice -@register_microservice( - name="opea_service@video2audio", - service_type=ServiceType.DATAPREP, - endpoint="/v1/video2audio", - host="0.0.0.0", - port=7078, - input_datatype=Base64ByteStrDoc, - output_datatype=Base64ByteStrDoc, -) -@register_statistics(names=["opea_service@video2audio"]) -async def audio_to_text(request: Base64ByteStrDoc): - """Convert video to audio and return the result in base64 format. - - Args: - request (Base64ByteStrDoc): The incoming request containing the video in base64 format. - - Returns: - Base64ByteStrDoc: The response containing the audio in base64 format. 
- """ - try: - # Generate a unique identifier for the video file - uid = str(uuid.uuid4()) - file_name = uid + ".mp4" - - logger.info("Received request for video to audio conversion.") - byte_str = request.byte_str - - # Decode the base64 string and save it as a video file - with open(file_name, "wb") as f: - f.write(base64.b64decode(byte_str)) - - # Convert the video file to audio and get the result in base64 format - response = v2a.convert_video_to_audio_base64(file_name) - - # Remove the temporary video file - os.remove(file_name) - - logger.info("Successfully converted video to audio.") - return Base64ByteStrDoc(byte_str=response) - - except requests.RequestException as e: - logger.error(f"Request to video-to-audio endpoint failed: {e}") - raise - except Exception as e: - logger.error(f"An error occurred during video to audio conversion: {e}") - raise - - -if __name__ == "__main__": - try: - # Initialize the Video2Audio instance - v2a = Video2Audio() - - # Log initialization message - logger.info("[video2audio - router] VIDEO2AUDIO initialized.") - - # Start the microservice - opea_microservices["opea_service@video2audio"].start() - - except Exception as e: - logger.error(f"Failed to start the microservice: {e}") - raise diff --git a/comps/dataprep/opensearch/README.md b/comps/dataprep/opensearch/README.md new file mode 100644 index 0000000000..a4067b7eaa --- /dev/null +++ b/comps/dataprep/opensearch/README.md @@ -0,0 +1,253 @@ +# Dataprep Microservice with OpenSearch + +For dataprep microservice for text input, we provide here the `Langchain` framework. + +## ๐Ÿš€1. Start Microservice with Python๏ผˆOption 1๏ผ‰ + +### 1.1 Install Requirements + +- option 1: Install Single-process version (for processing up to 10 files) + +```bash +apt update +apt install default-jre tesseract-ocr libtesseract-dev poppler-utils -y +# for langchain +cd langchain +pip install -r requirements.txt +``` + +### 1.2 Start OpenSearch Stack Server + +Please refer to this [readme](../../vectorstores/opensearch/README.md). + +### 1.3 Setup Environment Variables + +```bash +export your_ip=$(hostname -I | awk '{print $1}') +export OPENSEARCH_URL="http://${your_ip}:9200" +export INDEX_NAME=${your_index_name} +export PYTHONPATH=${path_to_comps} +``` + +### 1.4 Start Embedding Service + +First, you need to start a TEI service. + +```bash +your_port=6006 +model="BAAI/bge-base-en-v1.5" +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model +``` + +Then you need to test your TEI service using the following commands: + +```bash +curl localhost:$your_port/embed \ + -X POST \ + -d '{"inputs":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' +``` + +After checking that it works, set up environment variables. + +```bash +export TEI_ENDPOINT="http://localhost:$your_port" +``` + +### 1.4 Start Document Preparation Microservice for OpenSearch with Python Script + +Start document preparation microservice for OpenSearch with below command. + +- option 1: Start single-process version (for processing up to 10 files) + +```bash +cd langchain +python prepare_doc_opensearch.py +``` + +## ๐Ÿš€2. Start Microservice with Docker (Option 2) + +### 2.1 Start OpenSearch Stack Server + +Please refer to this [readme](../../vectorstores/opensearch/README.md). 
+
+### 2.2 Setup Environment Variables
+
+```bash
+export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+export TEI_ENDPOINT="http://${your_ip}:6006"
+export OPENSEARCH_URL="http://${your_ip}:9200"
+export INDEX_NAME=${your_index_name}
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+```
+
+### 2.3 Build Docker Image
+
+- Build docker image with langchain
+
+- option 1: Build the single-process version (for processing up to 10 files)
+
+```bash
+cd ../../
+docker build -t opea/dataprep-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/opensearch/langchain/Dockerfile .
+```
+
+### 2.4 Run Docker with CLI (Option A)
+
+- option 1: Start single-process version (for processing up to 10 files)
+
+```bash
+docker run -d --name="dataprep-opensearch-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/dataprep-opensearch:latest
+```
+
+### 2.5 Run with Docker Compose (Option B - deprecated, will be moved to GenAIExamples in the future)
+
+```bash
+# for langchain
+cd comps/dataprep/opensearch/langchain
+# common command
+docker compose -f docker-compose-dataprep-opensearch.yaml up -d
+```
+
+## 🚀3. Check Microservice Status
+
+```bash
+docker container logs -f dataprep-opensearch-server
+```
+
+## 🚀4. Consume Microservice
+
+### 4.1 Consume Upload API
+
+Once the document preparation microservice for OpenSearch is started, you can use the command below to invoke the microservice, convert your document to embeddings, and save it to the database.
+
+Make sure the file path after `files=@` is correct.
+
+- Single file upload
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    http://localhost:6007/v1/dataprep
+```
+
+You can specify `chunk_size` and `chunk_overlap` with the following command.
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    -F "chunk_size=1500" \
+    -F "chunk_overlap=100" \
+    http://localhost:6007/v1/dataprep
+```
+
+We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` refers to the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI service (please refer to 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./your_file.pdf" \
+    -F "process_table=true" \
+    -F "table_strategy=hq" \
+    http://localhost:6007/v1/dataprep
+```
+
+- Multiple file upload
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    -F "files=@./file2.txt" \
+    -F "files=@./file3.txt" \
+    http://localhost:6007/v1/dataprep
+```
+
+- Links upload (not supported for llama_index now)
+
+```bash
+curl -X POST \
+    -F 'link_list=["https://www.ces.tech/"]' \
+    http://localhost:6007/v1/dataprep
+```
+
+or
+
+```python
+import requests
+import json
+
+proxies = {"http": ""}
+url = "http://localhost:6007/v1/dataprep"
+urls = [
+    "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4"
+]
+payload = {"link_list": json.dumps(urls)}
+
+try:
+    resp = requests.post(url=url, data=payload, proxies=proxies)
+    print(resp.text)
+    resp.raise_for_status()  # Raise an exception for unsuccessful HTTP status codes
+    print("Request successful!")
+except requests.exceptions.RequestException as e:
+    print("An error occurred:", e)
+```
+
+### 4.2 Consume get_file API
+
+To get uploaded file structures, use the following command:
+
+```bash
+curl -X POST \
+    -H "Content-Type: application/json" \
+    http://localhost:6007/v1/dataprep/get_file
+```
+
+Then you will get a response JSON like this:
+
+```json
+[
+  {
+    "name": "uploaded_file_1.txt",
+    "id": "uploaded_file_1.txt",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "uploaded_file_2.txt",
+    "id": "uploaded_file_2.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```
+
+### 4.3 Consume delete_file API
+
+To delete an uploaded file or link, use the following command.
+
+The `file_path` here should be the `id` obtained from the `/v1/dataprep/get_file` API.
+ +```bash +# delete link +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "https://www.ces.tech/.txt"}' \ + http://localhost:6007/v1/dataprep/delete_file + +# delete file +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "uploaded_file_1.txt"}' \ + http://localhost:6007/v1/dataprep/delete_file + +# delete all files and links +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ + http://localhost:6007/v1/dataprep/delete_file +``` diff --git a/comps/dataprep/opensearch/langchain/Dockerfile b/comps/dataprep/opensearch/langchain/Dockerfile new file mode 100644 index 0000000000..f29a753bcd --- /dev/null +++ b/comps/dataprep/opensearch/langchain/Dockerfile @@ -0,0 +1,42 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + libreoffice \ + poppler-utils \ + tesseract-ocr + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/opensearch/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/opensearch/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/opensearch/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/opensearch/langchain + +ENTRYPOINT ["python", "prepare_doc_opensearch.py"] + diff --git a/comps/dataprep/opensearch/langchain/__init__.py b/comps/dataprep/opensearch/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/opensearch/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/opensearch/langchain/config.py b/comps/dataprep/opensearch/langchain/config.py new file mode 100644 index 0000000000..767cd84da7 --- /dev/null +++ b/comps/dataprep/opensearch/langchain/config.py @@ -0,0 +1,60 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# OpenSearch Connection Information +OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost") +OPENSEARCH_PORT = int(os.getenv("OPENSEARCH_PORT", 9200)) +OPENSEARCH_INITIAL_ADMIN_PASSWORD = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD", "") + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + + Returns: + bool: The value of the environment variable, interpreted as a boolean. 
+ """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +def format_opensearch_conn_from_env(): + opensearch_url = os.getenv("OPENSEARCH_URL", None) + if opensearch_url: + return opensearch_url + else: + using_ssl = get_boolean_env_var("OPENSEARCH_SSL", False) + start = "https://" if using_ssl else "http://" + + return start + f"{OPENSEARCH_HOST}:{OPENSEARCH_PORT}" + + +OPENSEARCH_URL = format_opensearch_conn_from_env() + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "rag-opensearch") +KEY_INDEX_NAME = os.getenv("KEY_INDEX_NAME", "file-keys") + +TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600)) + +SEARCH_BATCH_SIZE = int(os.getenv("SEARCH_BATCH_SIZE", 10)) diff --git a/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml b/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml new file mode 100644 index 0000000000..7699bee1ce --- /dev/null +++ b/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + opensearch-vector-db: + image: opensearchproject/opensearch:latest + container_name: opensearch-vector-db + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-vector-db + - discovery.seed_hosts=opensearch-vector-db + - cluster.initial_master_nodes=opensearch-vector-db + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + ports: + - 9200:9200 + - 9600:9600 # required for Performance Analyzer + networks: + - opensearch-net + security_opt: + - no-new-privileges:true + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + dataprep-opensearch: + image: opea/dataprep-opensearch:latest + container_name: dataprep-opensearch-server + ports: + - 6007:6007 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + OPENSEARCH_URL: ${OPENSEARCH_URL} + INDEX_NAME: ${INDEX_NAME} + TEI_ENDPOINT: ${TEI_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + restart: unless-stopped + security_opt: + - no-new-privileges:true + +networks: + default: + driver: bridge + opensearch-net: diff --git a/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py b/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py new file mode 100644 index 0000000000..10c9f83538 --- /dev/null +++ 
b/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py
@@ -0,0 +1,471 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from pathlib import Path
+from typing import List, Optional, Union
+
+from config import (
+    EMBED_MODEL,
+    INDEX_NAME,
+    KEY_INDEX_NAME,
+    OPENSEARCH_INITIAL_ADMIN_PASSWORD,
+    OPENSEARCH_URL,
+    SEARCH_BATCH_SIZE,
+)
+from fastapi import Body, File, Form, HTTPException, UploadFile
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_community.vectorstores import OpenSearchVectorSearch
+from langchain_huggingface import HuggingFaceEndpointEmbeddings
+from langchain_text_splitters import HTMLHeaderTextSplitter
+
+# from pyspark import SparkConf, SparkContext
+from opensearchpy import OpenSearch, helpers
+
+from comps import CustomLogger, DocPath, opea_microservices, register_microservice
+from comps.dataprep.utils import (
+    create_upload_folder,
+    document_loader,
+    encode_filename,
+    format_search_results,
+    get_separators,
+    get_tables_result,
+    parse_html,
+    remove_folder_with_ignore,
+    save_content_to_local_disk,
+)
+
+logger = CustomLogger("prepare_doc_opensearch")
+logflag = os.getenv("LOGFLAG", False)
+
+upload_folder = "./uploaded_files/"
+tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
+if tei_embedding_endpoint:
+    # create embeddings using TEI endpoint service
+    embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint)
+else:
+    # create embeddings using local embedding model
+    embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+auth = ("admin", OPENSEARCH_INITIAL_ADMIN_PASSWORD)
+opensearch_client = OpenSearchVectorSearch(
+    opensearch_url=OPENSEARCH_URL,
+    index_name=INDEX_NAME,
+    embedding_function=embeddings,
+    http_auth=auth,
+    use_ssl=True,
+    verify_certs=False,
+    ssl_assert_hostname=False,
+    ssl_show_warn=False,
+)
+
+
+def check_index_existence(client, index_name):
+    if logflag:
+        logger.info(f"[ check index existence ] checking {client}")
+    try:
+        exists = client.index_exists(index_name)
+        exists = False if exists is None else exists
+        if exists:
+            if logflag:
+                logger.info(f"[ check index existence ] index of client exists: {client}")
+        else:
+            if logflag:
+                logger.info("[ check index existence ] index does not exist")
+        return exists
+    except Exception as e:
+        if logflag:
+            logger.info(f"[ check index existence ] error checking index for client: {e}")
+        return False
+
+
+def create_index(client, index_name: str = KEY_INDEX_NAME):
+    if logflag:
+        logger.info(f"[ create index ] creating index {index_name}")
+    try:
+        index_body = {
+            "mappings": {
+                "properties": {
+                    "file_name": {"type": "text"},
+                    "key_ids": {"type": "text"},
+                }
+            }
+        }
+
+        # Create the index
+        client.client.indices.create(index_name, body=index_body)
+
+        if logflag:
+            logger.info(f"[ create index ] index {index_name} successfully created")
+        return True
+    except Exception as e:
+        if logflag:
+            logger.info(f"[ create index ] fail to create index {index_name}: {e}")
+        return False
+
+
+def store_by_id(client, key, value):
+    if logflag:
+        logger.info(f"[ store by id ] storing ids of {key}")
+    try:
+        client.client.index(
+            index=KEY_INDEX_NAME, body={"file_name": f"file:{key}", "key_ids": value}, id="file:" + key, refresh=True
+        )
+        if logflag:
+            logger.info(f"[ store by id ] store document success.
id: file:{key}") + except Exception as e: + if logflag: + logger.info(f"[ store by id ] fail to store document file:{key}: {e}") + return False + return True + + +def search_by_id(client, doc_id): + if logflag: + logger.info(f"[ search by id ] searching docs of {doc_id}") + try: + result = client.client.get(index=KEY_INDEX_NAME, id=doc_id) + if result["found"]: + if logflag: + logger.info(f"[ search by id ] search success of {doc_id}: {result}") + return result + return None + except Exception as e: + if logflag: + logger.info(f"[ search by id ] fail to search docs of {doc_id}: {e}") + return None + + +def drop_index(client, index_name): + if logflag: + logger.info(f"[ drop index ] dropping index {index_name}") + try: + client.client.indices.delete(index=index_name) + if logflag: + logger.info(f"[ drop index ] index {index_name} deleted") + except Exception as e: + if logflag: + logger.info(f"[ drop index ] index {index_name} delete failed: {e}") + return False + return True + + +def delete_by_id(client, doc_id): + try: + response = client.client.delete(index=KEY_INDEX_NAME, id=doc_id) + if response["result"] == "deleted": + if logflag: + logger.info(f"[ delete by id ] delete id success: {doc_id}") + return True + else: + if logflag: + logger.info(f"[ delete by id ] delete id failed: {doc_id}") + return False + except Exception as e: + if logflag: + logger.info(f"[ delete by id ] fail to delete ids {doc_id}: {e}") + return False + + +def ingest_chunks_to_opensearch(file_name: str, chunks: List): + if logflag: + logger.info(f"[ ingest chunks ] file name: {file_name}") + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + + file_ids = [] + for i in range(0, num_chunks, batch_size): + if logflag: + logger.info(f"[ ingest chunks ] Current batch: {i}") + batch_chunks = chunks[i : i + batch_size] + + keys = opensearch_client.add_texts(texts=batch_chunks, metadatas=[{"source": file_name} for _ in batch_chunks]) + if logflag: + logger.info(f"[ ingest chunks ] keys: {keys}") + file_ids.extend(keys) + if logflag: + logger.info(f"[ ingest chunks ] Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + # store file_ids into index file-keys + if not check_index_existence(opensearch_client, KEY_INDEX_NAME): + assert create_index(opensearch_client) + + try: + assert store_by_id(opensearch_client, key=file_name, value="#".join(file_ids)) + except Exception as e: + if logflag: + logger.info(f"[ ingest chunks ] {e}. 
Fail to store chunks of file {file_name}.")
+        raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.")
+    return True
+
+
+def ingest_data_to_opensearch(doc_path: DocPath):
+    """Ingest document to OpenSearch."""
+    path = doc_path.path
+    if logflag:
+        logger.info(f"[ ingest data ] Parsing document {path}.")
+
+    if path.endswith(".html"):
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+        ]
+        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    else:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=doc_path.chunk_size,
+            chunk_overlap=doc_path.chunk_overlap,
+            add_start_index=True,
+            separators=get_separators(),
+        )
+
+    content = document_loader(path)
+    if logflag:
+        logger.info("[ ingest data ] file content loaded")
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
+    ### Special processing for table content in PDFs
+    if doc_path.process_table and path.endswith(".pdf"):
+        table_chunks = get_tables_result(path, doc_path.table_strategy)
+        chunks = chunks + table_chunks
+    if logflag:
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")
+
+    file_name = doc_path.path.split("/")[-1]
+    return ingest_chunks_to_opensearch(file_name, chunks)
+
+
+def search_all_documents(index_name, offset, search_batch_size):
+    try:
+        response = opensearch_client.client.search(
+            index=index_name,
+            body={
+                "query": {"match_all": {}},
+                "from": offset,  # Starting position
+                "size": search_batch_size,  # Number of results to return
+            },
+        )
+        # Get total number of matching documents
+        total_hits = response["hits"]["total"]["value"]
+        # Get the documents from the current batch
+        documents = response["hits"]["hits"]
+
+        return {"total_hits": total_hits, "documents": documents}
+
+    except Exception as e:
+        print(f"Error performing search: {e}")
+        return None
+
+
+@register_microservice(name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep", host="0.0.0.0", port=6007)
+async def ingest_documents(
+    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
+    link_list: Optional[str] = Form(None),
+    chunk_size: int = Form(1500),
+    chunk_overlap: int = Form(100),
+    process_table: bool = Form(False),
+    table_strategy: str = Form("fast"),
+):
+    if logflag:
+        logger.info(f"[ upload ] files:{files}")
+        logger.info(f"[ upload ] link_list:{link_list}")
+
+    if files:
+        if not isinstance(files, list):
+            files = [files]
+        uploaded_files = []
+
+        for file in files:
+            encode_file = encode_filename(file.filename)
+            doc_id = "file:" + encode_file
+            if logflag:
+                logger.info(f"[ upload ] processing file {doc_id}")
+
+            # check whether the file already exists
+            key_ids = None
+            try:
+                document = search_by_id(opensearch_client, doc_id)
+                if document:
+                    if logflag:
+                        logger.info(f"[ upload ] File {file.filename} already exists.")
+                    key_ids = document["_id"]
+            except Exception as e:
+                logger.info(f"[ upload ] File {file.filename} does not exist.")
+            if key_ids:
+                raise HTTPException(
+                    status_code=400, detail=f"Uploaded file {file.filename} already exists. Please change the file name."
+                )
+
+            save_path = upload_folder + encode_file
+            await save_content_to_local_disk(save_path, file)
+            ingest_data_to_opensearch(
+                DocPath(
+                    path=save_path,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    process_table=process_table,
+                    table_strategy=table_strategy,
+                )
+            )
+            uploaded_files.append(save_path)
+            if logflag:
+                logger.info(f"[ upload ] Successfully saved file {save_path}")
+
+        result = {"status": 200, "message": "Data preparation succeeded"}
+        if logflag:
+            logger.info(result)
+        return result
+
+    if link_list:
+        link_list = json.loads(link_list)  # Parse JSON string to list
+        if not isinstance(link_list, list):
+            raise HTTPException(status_code=400, detail=f"Link_list {link_list} should be a list.")
+        for link in link_list:
+            encoded_link = encode_filename(link)
+            doc_id = "file:" + encoded_link + ".txt"
+            if logflag:
+                logger.info(f"[ upload ] processing link {doc_id}")
+
+            # check whether the link file already exists
+            key_ids = None
+            try:
+                document = search_by_id(opensearch_client, doc_id)
+                if document:
+                    if logflag:
+                        logger.info(f"[ upload ] Link {link} already exists.")
+                    key_ids = document["_id"]
+            except Exception as e:
+                logger.info(f"[ upload ] Link {link} does not exist. Keep storing.")
+            if key_ids:
+                raise HTTPException(
+                    status_code=400, detail=f"Uploaded link {link} already exists. Please use another link."
+                )
+
+            save_path = upload_folder + encoded_link + ".txt"
+            content = parse_html([link])[0][0]
+            await save_content_to_local_disk(save_path, content)
+            ingest_data_to_opensearch(
+                DocPath(
+                    path=save_path,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    process_table=process_table,
+                    table_strategy=table_strategy,
+                )
+            )
+        if logflag:
+            logger.info(f"[ upload ] Successfully saved link list {link_list}")
+        return {"status": 200, "message": "Data preparation succeeded"}
+
+    raise HTTPException(status_code=400, detail="Must provide either a file or a string list.")
+
+
+@register_microservice(
+    name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6007
+)
+async def rag_get_file_structure():
+    if logflag:
+        logger.info("[ get ] start to get file structure")
+
+    offset = 0
+    file_list = []
+
+    # check index existence
+    res = check_index_existence(opensearch_client, KEY_INDEX_NAME)
+    if not res:
+        if logflag:
+            logger.info(f"[ get ] index {KEY_INDEX_NAME} does not exist")
+        return file_list
+
+    while True:
+        response = search_all_documents(KEY_INDEX_NAME, offset, SEARCH_BATCH_SIZE)
+        # no doc retrieved
+        if not response or not response["documents"]:
+            break
+
+        def format_opensearch_results(response, file_list):
+            for document in response["documents"]:
+                file_id = document["_id"]
+                file_list.append({"name": file_id, "id": file_id, "type": "File", "parent": ""})
+            return file_list
+
+        file_list = format_opensearch_results(response, file_list)
+        offset += SEARCH_BATCH_SIZE
+        # last batch
+        if len(response["documents"]) < SEARCH_BATCH_SIZE:
+            break
+    if logflag:
+        logger.info(f"[get] final file_list: {file_list}")
+    return file_list
+
+
+@register_microservice(
+    name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6007
+)
+async def delete_single_file(file_path: str = Body(..., embed=True)):
+    """Delete file according to `file_path`.
+
+    `file_path`:
+        - specific file path (e.g.
/path/to/file.txt)
+        - "all": delete all files uploaded
+    """
+
+    # delete all uploaded files
+    if file_path == "all":
+        if logflag:
+            logger.info("[ delete ] delete all files")
+
+        # drop index KEY_INDEX_NAME
+        if check_index_existence(opensearch_client, KEY_INDEX_NAME):
+            try:
+                assert drop_index(opensearch_client, index_name=KEY_INDEX_NAME)
+            except Exception as e:
+                if logflag:
+                    logger.info(f"[ delete ] {e}. Fail to drop index {KEY_INDEX_NAME}.")
+                raise HTTPException(status_code=500, detail=f"Fail to drop index {KEY_INDEX_NAME}.")
+        else:
+            if logflag:
+                logger.info(f"[ delete ] Index {KEY_INDEX_NAME} does not exist.")
+
+        # drop index INDEX_NAME
+        if check_index_existence(opensearch_client, INDEX_NAME):
+            try:
+                assert drop_index(opensearch_client, index_name=INDEX_NAME)
+            except Exception as e:
+                if logflag:
+                    logger.info(f"[ delete ] {e}. Fail to drop index {INDEX_NAME}.")
+                raise HTTPException(status_code=500, detail=f"Fail to drop index {INDEX_NAME}.")
+        else:
+            if logflag:
+                logger.info(f"[ delete ] Index {INDEX_NAME} does not exist.")
+
+        # delete files on local disk
+        try:
+            remove_folder_with_ignore(upload_folder)
+        except Exception as e:
+            if logflag:
+                logger.info(f"[ delete ] {e}. Fail to delete {upload_folder}.")
+            raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.")
+
+        if logflag:
+            logger.info("[ delete ] successfully deleted all files.")
+        create_upload_folder(upload_folder)
+        if logflag:
+            logger.info({"status": True})
+        return {"status": True}
+    else:
+        raise HTTPException(status_code=404, detail="Single file deletion is not implemented yet")
+
+
+if __name__ == "__main__":
+    create_upload_folder(upload_folder)
+    opea_microservices["opea_service@prepare_doc_opensearch"].start()
diff --git a/comps/dataprep/opensearch/langchain/requirements.txt b/comps/dataprep/opensearch/langchain/requirements.txt
new file mode 100644
index 0000000000..fa242973e8
--- /dev/null
+++ b/comps/dataprep/opensearch/langchain/requirements.txt
@@ -0,0 +1,30 @@
+beautifulsoup4
+cairosvg
+docarray[full]
+docx2txt
+easyocr
+fastapi
+huggingface_hub
+langchain
+langchain-community
+langchain-text-splitters
+langchain_huggingface
+markdown
+numpy
+opensearch-py
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+pandas
+Pillow
+prometheus-fastapi-instrumentator
+pymupdf
+pyspark
+pytesseract
+python-bidi
+python-docx
+python-pptx
+sentence_transformers
+shortuuid
+unstructured[all-docs]
+uvicorn
diff --git a/comps/image2image/src/Dockerfile b/comps/image2image/src/Dockerfile
index 074cbf9dc9..5e84557fcb 100644
--- a/comps/image2image/src/Dockerfile
+++ b/comps/image2image/src/Dockerfile
@@ -16,8 +16,8 @@ RUN pip install --no-cache-dir --upgrade pip && \
 
 ENV PYTHONPATH=$PYTHONPATH:/home
 
-WORKDIR /home/comps/image2image/src/
+WORKDIR /home/comps/image2image/src
 
-RUN echo python image2image.py --bf16 >> run.sh
+RUN echo python opea_image2image_microservice.py --bf16 >> run.sh
 
 CMD bash run.sh
diff --git a/comps/image2image/src/Dockerfile.intel_hpu b/comps/image2image/src/Dockerfile.intel_hpu
index 1d25c439be..dd0d29f523 100644
--- a/comps/image2image/src/Dockerfile.intel_hpu
+++ b/comps/image2image/src/Dockerfile.intel_hpu
@@ -19,11 +19,11 @@ ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/home/user/optimum-habana
 
 # Install requirements and optimum habana
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /home/user/comps/image2image/requirements.txt && \
+    pip install --no-cache-dir -r /home/user/comps/image2image/src/requirements.txt && \
+    pip install --no-cache-dir optimum[habana]
 
-WORKDIR /home/user/comps/image2image
+WORKDIR /home/user/comps/image2image/src
 
-RUN echo python image2image.py --device hpu --use_hpu_graphs --bf16 >> run.sh
+RUN echo python opea_image2image_microservice.py --device hpu --use_hpu_graphs --bf16 >> run.sh
 
 CMD bash run.sh
diff --git a/comps/image2image/src/README.md b/comps/image2image/src/README.md
index 44a30d7c22..4d71161758 100644
--- a/comps/image2image/src/README.md
+++ b/comps/image2image/src/README.md
@@ -28,7 +28,7 @@ export HF_TOKEN=
 
 Start the OPEA Microservice:
 
 ```bash
-python image2image.py --bf16 --model_name_or_path $MODEL --token $HF_TOKEN
+python opea_image2image_microservice.py --bf16 --model_name_or_path $MODEL --token $HF_TOKEN
 ```
 
 # 🚀2. Start Microservice with Docker (Option 2)
diff --git a/comps/image2image/src/integration/__init__.py b/comps/image2image/src/integration/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/comps/image2image/src/integration/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/image2image/src/integration/opea_image2image_native.py b/comps/image2image/src/integration/opea_image2image_native.py
new file mode 100644
index 0000000000..4399c12931
--- /dev/null
+++ b/comps/image2image/src/integration/opea_image2image_native.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+import base64
+import os
+import threading
+
+from comps import CustomLogger, OpeaComponent, SDImg2ImgInputs, ServiceType
+
+logger = CustomLogger("opea_imagetoimage")
+logflag = os.getenv("LOGFLAG", False)
+
+import torch
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import load_image
+
+pipe = None
+args = None
+initialization_lock = threading.Lock()
+initialized = False
+
+
+def initialize(
+    model_name_or_path="stabilityai/stable-diffusion-xl-refiner-1.0",
+    device="cpu",
+    token=None,
+    bf16=True,
+    use_hpu_graphs=False,
+):
+    global pipe, args, initialized
+    with initialization_lock:
+        if not initialized:
+            # initialize model and tokenizer
+            if os.getenv("MODEL", None):
+                model_name_or_path = os.getenv("MODEL")
+            kwargs = {}
+            if bf16:
+                kwargs["torch_dtype"] = torch.bfloat16
+            if not token:
+                token = os.getenv("HF_TOKEN")
+            if device == "hpu":
+                kwargs.update(
+                    {
+                        "use_habana": True,
+                        "use_hpu_graphs": use_hpu_graphs,
+                        "gaudi_config": "Habana/stable-diffusion",
+                        "token": token,
+                    }
+                )
+                if "stable-diffusion-xl" in model_name_or_path:
+                    from optimum.habana.diffusers import GaudiStableDiffusionXLImg2ImgPipeline
+
+                    pipe = GaudiStableDiffusionXLImg2ImgPipeline.from_pretrained(
+                        model_name_or_path,
+                        **kwargs,
+                    )
+                else:
+                    raise NotImplementedError(
+                        "Only support stable-diffusion-xl now, " f"model {model_name_or_path} not supported."
+                    )
+            elif device == "cpu":
+                pipe = AutoPipelineForImage2Image.from_pretrained(model_name_or_path, token=token, **kwargs)
+            else:
+                raise NotImplementedError(f"Only support cpu and hpu device now, device {device} not supported.")
+            logger.info("Stable Diffusion model initialized.")
+            initialized = True
+
+
+class OpeaImageToImage(OpeaComponent):
+    """A specialized ImageToImage component derived from OpeaComponent for the Stable Diffusion model.
+
+    Attributes:
+        model_name_or_path (str): The name of the Stable Diffusion model used.
+        device (str): The device to use.
+        token (str): Hugging Face token.
+        bf16 (bool): Whether to use bf16 precision.
+        use_hpu_graphs (bool): Whether to use HPU graphs.
+ """ + + def __init__( + self, + name: str, + description: str, + config: dict = None, + seed=42, + model_name_or_path="stabilityai/stable-diffusion-xl-refiner-1.0", + device="cpu", + token=None, + bf16=True, + use_hpu_graphs=False, + ): + super().__init__(name, ServiceType.IMAGE2IMAGE.name.lower(), description, config) + initialize( + model_name_or_path=model_name_or_path, device=device, token=token, bf16=bf16, use_hpu_graphs=use_hpu_graphs + ) + self.pipe = pipe + self.seed = seed + + def invoke(self, input: SDImg2ImgInputs): + """Invokes the ImageToImage service to generate Images for the provided input. + + Args: + input (SDImg2ImgInputs): The input in SD images format. + """ + image = load_image(input.image).convert("RGB") + prompt = input.prompt + num_images_per_prompt = input.num_images_per_prompt + + generator = torch.manual_seed(self.seed) + images = pipe( + image=image, prompt=prompt, generator=generator, num_images_per_prompt=num_images_per_prompt + ).images + image_path = os.path.join(os.getcwd(), prompt.strip().replace(" ", "_").replace("/", "")) + os.makedirs(image_path, exist_ok=True) + results = [] + for i, image in enumerate(images): + save_path = os.path.join(image_path, f"image_{i + 1}.png") + image.save(save_path) + with open(save_path, "rb") as f: + bytes = f.read() + b64_str = base64.b64encode(bytes).decode() + results.append(b64_str) + + return results + + def check_health(self) -> bool: + """Checks the health of the ImageToImage service. + + Returns: + bool: True if the service is reachable and healthy, False otherwise. + """ + try: + if self.pipe: + return True + else: + return False + except Exception as e: + # Handle connection errors, timeouts, etc. + logger.error(f"Health check failed: {e}") + return False diff --git a/comps/image2image/src/opea_image2image_microservice.py b/comps/image2image/src/opea_image2image_microservice.py new file mode 100644 index 0000000000..7cf8da5168 --- /dev/null +++ b/comps/image2image/src/opea_image2image_microservice.py @@ -0,0 +1,81 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import os +import time + +from comps import ( + CustomLogger, + OpeaComponentController, + SDImg2ImgInputs, + SDOutputs, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.image2image.src.integration.opea_image2image_native import OpeaImageToImage + +args = None + +logger = CustomLogger("image2image") + + +# Initialize OpeaComponentController +controller = OpeaComponentController() + +# Register components +# try: + +# except Exception as e: +# logger.error(f"Failed to initialize components: {e}") + + +@register_microservice( + name="opea_service@image2image", + service_type=ServiceType.IMAGE2IMAGE, + endpoint="/v1/image2image", + host="0.0.0.0", + port=9389, + input_datatype=SDImg2ImgInputs, + output_datatype=SDOutputs, +) +@register_statistics(names=["opea_service@image2image"]) +def image2image(input: SDImg2ImgInputs): + start = time.time() + results = controller.invoke(input) + statistics_dict["opea_service@image2image"].append_latency(time.time() - start, None) + return SDOutputs(images=results) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_name_or_path", type=str, default="stabilityai/stable-diffusion-xl-refiner-1.0") + parser.add_argument("--use_hpu_graphs", default=False, action="store_true") + parser.add_argument("--device", type=str, default="cpu") + 
parser.add_argument("--token", type=str, default=None) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--bf16", action="store_true") + + args = parser.parse_args() + # Instantiate Animation component and register it to controller + opea_imagetoimage = OpeaImageToImage( + name="OpeaImageToImage", + description="OPEA Image To Image Service", + seed=args.seed, + model_name_or_path=args.model_name_or_path, + device=args.device, + token=args.token, + bf16=args.bf16, + use_hpu_graphs=args.use_hpu_graphs, + ) + + controller.register(opea_imagetoimage) + + # Discover and activate a healthy component + controller.discover_and_activate() + logger.info("Image2image server started.") + opea_microservices["opea_service@image2image"].start() diff --git a/comps/retrievers/opensearch/langchain/Dockerfile b/comps/retrievers/opensearch/langchain/Dockerfile new file mode 100644 index 0000000000..038b5d6bc1 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/Dockerfile @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY comps /home/user/comps + +USER user + +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/opensearch/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/opensearch/langchain + +ENTRYPOINT ["python", "retriever_opensearch.py"] diff --git a/comps/retrievers/opensearch/langchain/README.md b/comps/retrievers/opensearch/langchain/README.md new file mode 100644 index 0000000000..487f8e7d53 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/README.md @@ -0,0 +1,144 @@ +# Retriever Microservice + +This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors. It operates by receiving an embedding vector as input and conducting a similarity search against vectors stored in a VectorDB database. Users must specify the VectorDB's URL and the index name, and the service searches within that index to find documents with the highest similarity to the input vector. + +The service primarily utilizes similarity measures in vector space to rapidly retrieve contentually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval. + +Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. + +## ๐Ÿš€1. Start Microservice with Python (Option 1) + +To start the retriever microservice, you must first install the required python packages. 
+
+### 1.1 Install Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### 1.2 Start TEI Service
+
+```bash
+model=BAAI/bge-base-en-v1.5
+volume=$PWD/data
+docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
+```
+
+### 1.3 Verify the TEI Service
+
+Health check the embedding service with:
+
+```bash
+curl 127.0.0.1:6060/embed \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?"}' \
+    -H 'Content-Type: application/json'
+```
+
+### 1.4 Setup VectorDB Service
+
+You need to set up your own VectorDB service (OpenSearch in this example) and ingest your knowledge documents into the vector database.
+
+For OpenSearch, you can start a Docker container by following the instructions in the OpenSearch vectorstores [README.md](../../../vectorstores/opensearch/README.md).
+
+### 1.5 Start Retriever Service
+
+```bash
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
+python retriever_opensearch.py
+```
+
+## 🚀2. Start Microservice with Docker (Option 2)
+
+### 2.1 Setup Environment Variables
+
+```bash
+export RETRIEVE_MODEL_ID="BAAI/bge-base-en-v1.5"
+export OPENSEARCH_URL="http://${your_ip}:9200"
+export INDEX_NAME=${your_index_name}
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
+export OPENSEARCH_INITIAL_ADMIN_PASSWORD=${your_opensearch_initial_admin_password}
+```
+
+### 2.2 Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/retriever-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/opensearch/langchain/Dockerfile .
+```
+
+To start a docker container, you have two options:
+
+- A. Run Docker with CLI
+- B. Run Docker with Docker Compose
+
+You can choose one as needed.
+
+### 2.3 Run Docker with CLI (Option A)
+
+```bash
+docker run -d --name="retriever-opensearch-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/retriever-opensearch:latest
+```
+
+### 2.4 Run Docker with Docker Compose (Option B)
+
+```bash
+docker compose -f docker_compose_retriever.yaml up -d
+```
+
+## 🚀3. Consume Retriever Service
+
+### 3.1 Check Service Status
+
+```bash
+curl http://localhost:7000/v1/health_check \
+    -X GET \
+    -H 'Content-Type: application/json'
+```
+
+### 3.2 Consume Retriever Microservice
+
+To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://${your_ip}:7000/v1/retrieval \
+    -X POST \
+    -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
+    -H 'Content-Type: application/json'
+```
+
+You can set the parameters for the retriever.
+ +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/opensearch/langchain/__init__.py b/comps/retrievers/opensearch/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml b/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml new file mode 100644 index 0000000000..653e413a32 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml @@ -0,0 +1,36 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tei_xeon_service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + container_name: tei-xeon_server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + command: --model-id ${RETRIEVE_MODEL_ID} + retriever: + image: opea/retriever-opensearch-server + container_name: retriever-opensearch-server + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + OPENSEARCH_URL: ${OPENSEARCH_URL} + INDEX_NAME: ${INDEX_NAME} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + restart: unless-stopped + security_opt: + - no-new-privileges:true + +networks: + default: + driver: bridge diff --git a/comps/retrievers/opensearch/langchain/opensearch_config.py b/comps/retrievers/opensearch/langchain/opensearch_config.py new file mode 100644 index 0000000000..fd6b68d357 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/opensearch_config.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an 
environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + + Returns: + bool: The value of the environment variable, interpreted as a boolean. + """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +# Whether or not to enable langchain debugging +DEBUG = get_boolean_env_var("DEBUG", False) +# Set DEBUG env var to "true" if you wish to enable LC debugging module +if DEBUG: + import langchain + + langchain.debug = True + + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + + +# OpenSearch Connection Information +OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost") +OPENSEARCH_PORT = int(os.getenv("OPENSEARCH_PORT", 9200)) +OPENSEARCH_INITIAL_ADMIN_PASSWORD = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD", "") + + +def format_opensearch_conn_from_env(): + opensearch_url = os.getenv("OPENSEARCH_URL", None) + if opensearch_url: + return opensearch_url + else: + using_ssl = get_boolean_env_var("OPENSEARCH_SSL", False) + start = "https://" if using_ssl else "http://" + + return start + f"{OPENSEARCH_HOST}:{OPENSEARCH_PORT}" + + +OPENSEARCH_URL = format_opensearch_conn_from_env() + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "rag-opensearch") + + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) diff --git a/comps/retrievers/opensearch/langchain/requirements.txt b/comps/retrievers/opensearch/langchain/requirements.txt new file mode 100644 index 0000000000..5690118bbb --- /dev/null +++ b/comps/retrievers/opensearch/langchain/requirements.txt @@ -0,0 +1,16 @@ +docarray[full] +easyocr +fastapi +langchain_community +langchain_huggingface +numpy +opensearch-py +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pydantic +pymupdf +sentence_transformers +shortuuid +uvicorn diff --git a/comps/retrievers/opensearch/langchain/retriever_opensearch.py b/comps/retrievers/opensearch/langchain/retriever_opensearch.py new file mode 100644 index 0000000000..c570cb6db5 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/retriever_opensearch.py @@ -0,0 +1,162 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Callable, List, Union + +import numpy as np +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +from langchain_community.vectorstores import OpenSearchVectorSearch +from langchain_huggingface import HuggingFaceEndpointEmbeddings +from opensearch_config import EMBED_MODEL, INDEX_NAME, OPENSEARCH_INITIAL_ADMIN_PASSWORD, OPENSEARCH_URL +from pydantic import conlist + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + RetrievalResponseData, +) + +logger = CustomLogger("retriever_opensearch") +logflag = os.getenv("LOGFLAG", False) + +tei_embedding_endpoint = 
os.getenv("TEI_EMBEDDING_ENDPOINT", None) + + +async def search_all_embeddings_vectors( + embeddings: Union[conlist(float, min_length=0), List[conlist(float, min_length=0)]], func: Callable, *args, **kwargs +): + try: + if not isinstance(embeddings, np.ndarray): + embeddings = np.array(embeddings) + + if not np.issubdtype(embeddings.dtype, np.floating): + raise ValueError("All embeddings values must be floating point numbers") + + if embeddings.ndim == 1: + return await func(embedding=embeddings, *args, **kwargs) + elif embeddings.ndim == 2: + responses = [] + for emb in embeddings: + response = await func(embedding=emb, *args, **kwargs) + responses.extend(response) + return responses + else: + raise ValueError("Embeddings must be one or two dimensional") + except Exception as e: + raise ValueError(f"Embedding data is not valid: {e}") + + +@register_microservice( + name="opea_service@retriever_opensearch", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_opensearch"]) +async def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + if logflag: + logger.info(input) + start = time.time() + + # Check if the index exists and has documents + doc_count = 0 + + index_exists = vector_db.client.indices.exists(index=INDEX_NAME) + if index_exists: + doc_count = vector_db.client.count(index=INDEX_NAME)["count"] + if (not index_exists) or doc_count == 0: + search_res = [] + else: + if isinstance(input, EmbedDoc): + query = input.text + else: + # for RetrievalRequest, ChatCompletionRequest + query = input.input + # if the OpenSearch index has data, perform the search + if input.search_type == "similarity": + search_res = await search_all_embeddings_vectors( + embeddings=input.embedding, + func=vector_db.asimilarity_search_by_vector, + k=input.k, + ) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = await search_all_embeddings_vectors( + embeddings=input.embedding, + func=vector_db.asimilarity_search_by_vector, + k=input.k, + distance_threshold=input.distance_threshold, + ) + elif input.search_type == "similarity_score_threshold": + doc_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in doc_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + else: + raise ValueError(f"{input.search_type} not valid") + + # return different response format + retrieved_docs = [] + if isinstance(input, EmbedDoc): + for r in search_res: + retrieved_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + else: + for r in search_res: + retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in retrieved_docs] + result = input + + 
statistics_dict["opea_service@retriever_opensearch"].append_latency(time.time() - start, None)
+    if logflag:
+        logger.info(result)
+    return result
+
+
+if __name__ == "__main__":
+    # Create vectorstore
+    if tei_embedding_endpoint:
+        # create embeddings using TEI endpoint service
+        embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint)
+    else:
+        # create embeddings using local embedding model
+        embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+
+    auth = ("admin", OPENSEARCH_INITIAL_ADMIN_PASSWORD)
+    vector_db = OpenSearchVectorSearch(
+        opensearch_url=OPENSEARCH_URL,
+        index_name=INDEX_NAME,
+        embedding_function=embeddings,
+        http_auth=auth,
+        use_ssl=True,
+        verify_certs=False,
+        ssl_assert_hostname=False,
+        ssl_show_warn=False,
+    )
+    opea_microservices["opea_service@retriever_opensearch"].start()
diff --git a/comps/vectorstores/opensearch/README.md b/comps/vectorstores/opensearch/README.md
new file mode 100644
index 0000000000..f784d72967
--- /dev/null
+++ b/comps/vectorstores/opensearch/README.md
@@ -0,0 +1,35 @@
+# Start OpenSearch server
+
+## Prerequisites
+
+1. Install Docker
+1. Install Docker Compose (if not already installed)
+   1. `sudo curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose`
+   2. `sudo chmod +x /usr/local/bin/docker-compose`
+
+## Instructions
+
+### 1. Set admin password as environment variable
+
+OpenSearch versions 2.12 and later require a custom admin password to be set. Following [these guidelines](https://opensearch.org/docs/latest/security/configuration/demo-configuration/#setting-up-a-custom-admin-password), set the admin password as an environment variable to be used by the `docker-compose-opensearch.yaml` file, e.g. `export OPENSEARCH_INITIAL_ADMIN_PASSWORD=_some_admin_password`, in the terminal before starting the Docker containers.
+
+### 2. Start the cluster
+
+`docker-compose -f docker-compose-opensearch.yaml up`
+
+## Troubleshooting
+
+### "java.nio.file.FileSystemNotFoundException: null" error
+
+1. Make sure to grant read permissions to your local data volume folders
+   1. `sudo chown -R instance_user:instance_user ./opensearch-data1`
+   2. `sudo chown -R instance_user:instance_user ./opensearch-data2`
+   3. Replace `instance_user` with the login user (e.g. ec2-user, ssm-user, or your local user name)
+2. Try increasing the virtual max memory map count
+   1. `sudo sysctl -w vm.max_map_count=262144`
+
+### OpenSearch Dashboards container errors
+
+1. Make sure to grant read permission to the `opensearch_dashboards.yml` file
+   1. `sudo chown -R instance_user:instance_user ./opensearch_dashboards.yml`
+   2. Replace `instance_user` with the login user (e.g.
diff --git a/comps/vectorstores/opensearch/__init__.py b/comps/vectorstores/opensearch/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/comps/vectorstores/opensearch/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/vectorstores/opensearch/docker-compose-opensearch.yaml b/comps/vectorstores/opensearch/docker-compose-opensearch.yaml
new file mode 100644
index 0000000000..1769850e65
--- /dev/null
+++ b/comps/vectorstores/opensearch/docker-compose-opensearch.yaml
@@ -0,0 +1,81 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: '3'
+services:
+  opensearch-node1:
+    image: opensearchproject/opensearch:latest
+    container_name: opensearch-node1
+    environment:
+      - cluster.name=opensearch-cluster
+      - node.name=opensearch-node1
+      - discovery.seed_hosts=opensearch-node1,opensearch-node2
+      - cluster.initial_master_nodes=opensearch-node1,opensearch-node2
+      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
+      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
+        hard: 65536
+    volumes:
+      - ./opensearch-data1:/usr/share/opensearch/data # the opensearchproject/opensearch image stores its data under /usr/share/opensearch/data
+    ports:
+      - 9200:9200
+      - 9600:9600 # required for Performance Analyzer
+    networks:
+      - opensearch-net
+    security_opt:
+      - no-new-privileges:true
+  opensearch-node2:
+    image: opensearchproject/opensearch:latest
+    container_name: opensearch-node2
+    environment:
+      - cluster.name=opensearch-cluster
+      - node.name=opensearch-node2
+      - discovery.seed_hosts=opensearch-node1,opensearch-node2
+      - cluster.initial_master_nodes=opensearch-node1,opensearch-node2
+      - bootstrap.memory_lock=true
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
+      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+    volumes:
+      - ./opensearch-data2:/usr/share/opensearch/data
+    networks:
+      - opensearch-net
+    security_opt:
+      - no-new-privileges:true
+  opensearch-dashboards:
+    image: opensearchproject/opensearch-dashboards:latest
+    volumes:
+      - ./opensearch_dashboards.yml:/usr/share/opensearch-dashboards/config/opensearch_dashboards.yml
+    container_name: opensearch-dashboards
+    ports:
+      - 5601:5601
+    expose:
+      - "5601"
+    environment:
+      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # must be a string with no spaces when specified as an environment variable
+    networks:
+      - opensearch-net
+    security_opt:
+      - no-new-privileges:true
+    depends_on:
+      - opensearch-node1
+      - opensearch-node2
+
+volumes:
+  opensearch-data1:
+  opensearch-data2:
+
+networks:
+  opensearch-net:
diff --git a/comps/vectorstores/opensearch/opensearch_dashboards.yml b/comps/vectorstores/opensearch/opensearch_dashboards.yml
new file mode 100644
index 0000000000..f6d43e6ed0
--- /dev/null
+++ b/comps/vectorstores/opensearch/opensearch_dashboards.yml
@@ -0,0 +1,210 @@
+---
+# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +# Description: +# Default configuration for OpenSearch Dashboards + +# OpenSearch Dashboards is served by a back end server. This setting specifies the port to use. +# server.port: 5601 + +# Specifies the address to which the OpenSearch Dashboards server will bind. IP addresses and host names are both valid values. +# The default is 'localhost', which usually means remote machines will not be able to connect. +# To allow connections from remote users, set this parameter to a non-loopback address. +# server.host: "localhost" + +# Enables you to specify a path to mount OpenSearch Dashboards at if you are running behind a proxy. +# Use the `server.rewriteBasePath` setting to tell OpenSearch Dashboards if it should remove the basePath +# from requests it receives, and to prevent a deprecation warning at startup. +# This setting cannot end in a slash. +# server.basePath: "" + +# Specifies whether OpenSearch Dashboards should rewrite requests that are prefixed with +# `server.basePath` or require that they are rewritten by your reverse proxy. +# server.rewriteBasePath: false + +# The maximum payload size in bytes for incoming server requests. +# server.maxPayloadBytes: 1048576 + +# The OpenSearch Dashboards server's name. This is used for display purposes. +# server.name: "your-hostname" + +# The URLs of the OpenSearch instances to use for all your queries. +# opensearch.hosts: ["http://localhost:9200"] + +# OpenSearch Dashboards uses an index in OpenSearch to store saved searches, visualizations and +# dashboards. OpenSearch Dashboards creates a new index if the index doesn't already exist. +# opensearchDashboards.index: ".opensearch_dashboards" + +# The default application to load. +# opensearchDashboards.defaultAppId: "home" + +# Setting for an optimized healthcheck that only uses the local OpenSearch node to do Dashboards healthcheck. +# This settings should be used for large clusters or for clusters with ingest heavy nodes. +# It allows Dashboards to only healthcheck using the local OpenSearch node rather than fan out requests across all nodes. +# +# It requires the user to create an OpenSearch node attribute with the same name as the value used in the setting +# This node attribute should assign all nodes of the same cluster an integer value that increments with each new cluster that is spun up +# e.g. in opensearch.yml file you would set the value to a setting using node.attr.cluster_id: +# Should only be enabled if there is a corresponding node attribute created in your OpenSearch config that matches the value here +# opensearch.optimizedHealthcheckId: "cluster_id" + +# If your OpenSearch is protected with basic authentication, these settings provide +# the username and password that the OpenSearch Dashboards server uses to perform maintenance on the OpenSearch Dashboards +# index at startup. Your OpenSearch Dashboards users still need to authenticate with OpenSearch, which +# is proxied through the OpenSearch Dashboards server. +# opensearch.username: "opensearch_dashboards_system" +# opensearch.password: "pass" + +# Enables SSL and paths to the PEM-format SSL certificate and SSL key files, respectively. +# These settings enable SSL for outgoing requests from the OpenSearch Dashboards server to the browser. 
+# server.ssl.enabled: false +# server.ssl.certificate: /path/to/your/server.crt +# server.ssl.key: /path/to/your/server.key + +# Optional settings that provide the paths to the PEM-format SSL certificate and key files. +# These files are used to verify the identity of OpenSearch Dashboards to OpenSearch and are required when +# xpack.security.http.ssl.client_authentication in OpenSearch is set to required. +# opensearch.ssl.certificate: /path/to/your/client.crt +# opensearch.ssl.key: /path/to/your/client.key + +# Optional setting that enables you to specify a path to the PEM file for the certificate +# authority for your OpenSearch instance. +# opensearch.ssl.certificateAuthorities: [ "/path/to/your/CA.pem" ] + +# To disregard the validity of SSL certificates, change this setting's value to 'none'. +# opensearch.ssl.verificationMode: full + +# Time in milliseconds to wait for OpenSearch to respond to pings. Defaults to the value of +# the opensearch.requestTimeout setting. +# opensearch.pingTimeout: 1500 + +# Time in milliseconds to wait for responses from the back end or OpenSearch. This value +# must be a positive integer. +# opensearch.requestTimeout: 30000 + +# List of OpenSearch Dashboards client-side headers to send to OpenSearch. To send *no* client-side +# headers, set this value to [] (an empty list). +# opensearch.requestHeadersWhitelist: [ authorization ] + +# Header names and values that are sent to OpenSearch. Any custom headers cannot be overwritten +# by client-side headers, regardless of the opensearch.requestHeadersWhitelist configuration. +# opensearch.customHeaders: {} + +# Time in milliseconds for OpenSearch to wait for responses from shards. Set to 0 to disable. +# opensearch.shardTimeout: 30000 + +# Logs queries sent to OpenSearch. Requires logging.verbose set to true. +# opensearch.logQueries: false + +# Specifies the path where OpenSearch Dashboards creates the process ID file. +# pid.file: /var/run/opensearchDashboards.pid + +# Enables you to specify a file where OpenSearch Dashboards stores log output. +# logging.dest: stdout + +# Set the value of this setting to true to suppress all logging output. +# logging.silent: false + +# Set the value of this setting to true to suppress all logging output other than error messages. +# logging.quiet: false + +# Set the value of this setting to true to log all events, including system usage information +# and all requests. +# logging.verbose: false + +# Set the interval in milliseconds to sample system and process performance +# metrics. Minimum is 100ms. Defaults to 5000. +# ops.interval: 5000 + +# Specifies locale to be used for all localizable strings, dates and number formats. +# Supported languages are the following: English - en , by default , Chinese - zh-CN . +# i18n.locale: "en" + +# Set the allowlist to check input graphite Url. Allowlist is the default check list. +# vis_type_timeline.graphiteAllowedUrls: ['https://www.hostedgraphite.com/UID/ACCESS_KEY/graphite'] + +# Set the blocklist to check input graphite Url. Blocklist is an IP list. 
+# Below is an example for reference
+# vis_type_timeline.graphiteBlockedIPs: [
+#  //Loopback
+#  '127.0.0.0/8',
+#  '::1/128',
+#  //Link-local Address for IPv6
+#  'fe80::/10',
+#  //Private IP address for IPv4
+#  '10.0.0.0/8',
+#  '172.16.0.0/12',
+#  '192.168.0.0/16',
+#  //Unique local address (ULA)
+#  'fc00::/7',
+#  //Reserved IP address
+#  '0.0.0.0/8',
+#  '100.64.0.0/10',
+#  '192.0.0.0/24',
+#  '192.0.2.0/24',
+#  '198.18.0.0/15',
+#  '192.88.99.0/24',
+#  '198.51.100.0/24',
+#  '203.0.113.0/24',
+#  '224.0.0.0/4',
+#  '240.0.0.0/4',
+#  '255.255.255.255/32',
+#  '::/128',
+#  '2001:db8::/32',
+#  'ff00::/8',
+# ]
+# vis_type_timeline.graphiteBlockedIPs: []
+
+# opensearchDashboards.branding:
+#   logo:
+#     defaultUrl: ""
+#     darkModeUrl: ""
+#   mark:
+#     defaultUrl: ""
+#     darkModeUrl: ""
+#   loadingLogo:
+#     defaultUrl: ""
+#     darkModeUrl: ""
+#   faviconUrl: ""
+#   applicationTitle: ""
+
+# Set the value of this setting to true to capture region blocked warnings and errors
+# for your map rendering services.
+# map.showRegionBlockedWarning: false
+
+# Set the value of this setting to false to suppress search usage telemetry
+# for reducing the load of OpenSearch cluster.
+# data.search.usageTelemetry.enabled: false
+
+# 2.4 renames 'wizard.enabled: false' to 'vis_builder.enabled: false'
+# Set the value of this setting to false to disable VisBuilder
+# functionality in Visualization.
+# vis_builder.enabled: false
+
+# 2.4 New Experimental Feature
+# Set the value of this setting to true to enable the experimental multiple data source
+# support feature. Use with caution.
+# data_source.enabled: false
+# Set the value of these settings to customize crypto materials to encryption saved credentials
+# in data sources.
+# data_source.encryption.wrappingKeyName: 'changeme'
+# data_source.encryption.wrappingKeyNamespace: 'changeme'
+# data_source.encryption.wrappingKey: [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+
+# 2.6 New ML Commons Dashboards Experimental Feature
+# Set the value of this setting to true to enable the experimental ml commons dashboards
+ml_commons_dashboards.enabled: true
+
+opensearch.hosts: ["https://localhost:9200"]
+opensearch.ssl.verificationMode: none
+opensearch.username: kibanaserver
+opensearch.password: kibanaserver
+opensearch.requestHeadersWhitelist: [authorization, securitytenant]
+
+opensearch_security.multitenancy.enabled: true
+opensearch_security.multitenancy.tenants.preferred: [Private, Global]
+opensearch_security.readonly_mode.roles: [kibana_read_only]
+# Use this setting if you are running opensearch-dashboards without https
+opensearch_security.cookie.secure: false
+server.host: '0.0.0.0'
diff --git a/tests/animation/test_animation_wav2lip.sh b/tests/animation/test_animation_opea.sh
old mode 100755
new mode 100644
similarity index 67%
rename from tests/animation/test_animation_wav2lip.sh
rename to tests/animation/test_animation_opea.sh
index ddc0c0cb04..6aad155a7c
--- a/tests/animation/test_animation_wav2lip.sh
+++ b/tests/animation/test_animation_opea.sh
@@ -10,14 +10,14 @@ ip_address=$(hostname -I | awk '{print $1}')
 function build_docker_images() {
     cd $WORKPATH
     echo $(pwd)
-    docker build -t opea/wav2lip:comps -f comps/animation/wav2lip/dependency/Dockerfile .
+    docker build -t opea/wav2lip:comps -f comps/animation/src/integration/dependency/Dockerfile .
     if [ $? -ne 0 ]; then
         echo "opea/wav2lip built fail"
         exit 1
     else
         echo "opea/wav2lip built successful"
     fi
-    docker build --no-cache -t opea/animation:comps -f comps/animation/wav2lip/Dockerfile .
+ docker build --no-cache -t opea/animation:comps -f comps/animation/src/Dockerfile . if [ $? -ne 0 ]; then echo "opea/animation built fail" exit 1 @@ -35,22 +35,22 @@ function start_service() { export ANIMATION_PORT=9066 export INFERENCE_MODE='wav2lip+gfpgan' export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth' - export FACE="assets/img/avatar1.jpg" + export FACE="/home/user/comps/animation/src/assets/img/avatar1.jpg" export AUDIO='None' export FACESIZE=96 - export OUTFILE="assets/outputs/result.mp4" + export OUTFILE="/home/user/comps/animation/src/assets/outputs/result.mp4" export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed export UPSCALE_FACTOR=1 export FPS=10 - docker run -d --name="test-comps-animation-wav2lip" -v $WORKPATH/comps/animation/wav2lip/assets:/home/user/comps/animation/wav2lip/assets -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT -p 7860:7860 --ipc=host opea/wav2lip:comps - docker run -d --name="test-comps-animation" -v $WORKPATH/comps/animation/wav2lip/assets:/home/user/comps/animation/wav2lip/assets -e WAV2LIP_ENDPOINT=http://$ip_address:7860 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9066:9066 --ipc=host opea/animation:comps + docker run -d --name="test-comps-animation-wav2lip" -v $WORKPATH/comps/animation/src/assets:/home/user/comps/animation/src/assets -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e DEVICE=$DEVICE -e INFERENCE_MODE=$INFERENCE_MODE -e CHECKPOINT_PATH=$CHECKPOINT_PATH -e FACE=$FACE -e AUDIO=$AUDIO -e FACESIZE=$FACESIZE -e OUTFILE=$OUTFILE -e GFPGAN_MODEL_VERSION=$GFPGAN_MODEL_VERSION -e UPSCALE_FACTOR=$UPSCALE_FACTOR -e FPS=$FPS -e WAV2LIP_PORT=$WAV2LIP_PORT -p 7860:7860 --ipc=host opea/wav2lip:comps + docker run -d --name="test-comps-animation" -v $WORKPATH/comps/animation/src/assets:/home/user/comps/animation/src/assets -e WAV2LIP_ENDPOINT=http://$ip_address:7860 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9066:9066 --ipc=host opea/animation:comps sleep 3m } function validate_microservice() { cd $WORKPATH - result=$(http_proxy="" curl http://localhost:9066/v1/animation -X POST -H "Content-Type: application/json" -d @comps/animation/wav2lip/assets/audio/sample_question.json) + result=$(http_proxy="" curl http://localhost:9066/v1/animation -X POST -H "Content-Type: application/json" -d @comps/animation/src/assets/audio/sample_question.json) if [[ $result == *"result.mp4"* ]]; then echo "Result correct." 
else diff --git a/tests/dataprep/test_dataprep_multimedia.sh b/tests/dataprep/test_dataprep_multimedia.sh deleted file mode 100644 index c151f6b06f..0000000000 --- a/tests/dataprep/test_dataprep_multimedia.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# set -xe - -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" - -host_ip=$(hostname -I | awk '{print $1}') - -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} -export no_proxy="${no_proxy},${host_ip}" - -export V2A_SERVICE_HOST_IP=${host_ip} -export V2A_ENDPOINT=http://$host_ip:7078 - -export A2T_ENDPOINT=http://$host_ip:7066 -export A2T_SERVICE_HOST_IP=${host_ip} -export A2T_SERVICE_PORT=9099 - -export DATA_ENDPOINT=http://$host_ip:7079 -export DATA_SERVICE_HOST_IP=${host_ip} -export DATA_SERVICE_PORT=7079 - -# Get the root folder of the current script -ROOT_FOLDER=$(dirname "$(readlink -f "$0")") - -function build_docker_images() { - cd $WORKPATH - echo "Current working directory: $(pwd)" - - # Array of Docker build configurations - declare -A docker_builds=( - ["opea/whisper:comps"]="comps/asr/whisper/dependency/Dockerfile" - ["opea/a2t:comps"]="comps/dataprep/multimedia2text/audio2text/Dockerfile" - ["opea/v2a:comps"]="comps/dataprep/multimedia2text/video2audio/Dockerfile" - ["opea/multimedia2text:comps"]="comps/dataprep/multimedia2text/Dockerfile" - ) - - # Loop through the array and build each Docker image - for image in "${!docker_builds[@]}"; do - dockerfile=${docker_builds[$image]} - echo "Building Docker image: $image from Dockerfile: $dockerfile" - - docker build --no-cache -t $image --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $dockerfile . - - if [ $? -ne 0 ]; then - echo "$image build failed" - exit 1 - else - echo "$image build successful" - fi - done - - # List Docker images and wait for 1 second - docker images && sleep 1s -} - -function start_services() { - - docker run -d -p 7066:7066 --name="test-comps-mm-whisper-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:comps - if [ $? -ne 0 ]; then - echo "opea/whisper service fail to start" - exit 1 - else - echo "opea/whisper start successful" - fi - - - docker run -d -p 9199:9099 --name="test-comps-mm-a2t-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e A2T_ENDPOINT=http://$host_ip:7066 opea/a2t:comps - if [ $? -ne 0 ]; then - echo "opea/a2t service fail to start" - exit 1 - else - echo "opea/a2t start successful" - fi - - docker run -d -p 7078:7078 --name="test-comps-mm-v2a-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/v2a:comps - if [ $? -ne 0 ]; then - echo "opea/v2a service fail to start" - exit 1 - else - echo "opea/v2a start successful" - fi - - - docker run -d -p 7079:7079 --name="test-comps-mm-multimedia2text-service" --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ - -e A2T_ENDPOINT=http://$host_ip:7066 \ - -e V2A_ENDPOINT=http://$host_ip:7078 \ - opea/multimedia2text:comps - - if [ $? 
-ne 0 ]; then - echo "opea/multimedia2text service fail to start" - exit 1 - else - echo "opea/multimedia2text start successful" - fi - - sleep 120s - -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "EXPECTED_RESULT==> $EXPECTED_RESULT" - echo "CONTENT==> $CONTENT" - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s - -} - -get_base64_str() { - local file_name=$1 - base64 -w 0 "$file_name" -} - -# Function to generate input data for testing based on the document type -input_data_for_test() { - local document_type=$1 - case $document_type in - ("text") - echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - ;; - ("audio") - # get_base64_str "$ROOT_FOLDER/data/test.wav" - get_base64_str "$WORKPATH/comps/dataprep/multimedia2text/data/intel_short.wav" - ;; - ("video") - # get_base64_str "$ROOT_FOLDER/data/test.mp4" - get_base64_str "$WORKPATH/comps/dataprep/multimedia2text/data/intel_short.mp4" - ;; - (*) - echo "Invalid document type" >&2 - exit 1 - ;; - esac -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # whisper microservice - ulimit -s 65536 - validate_services \ - "${host_ip}:7066/v1/asr" \ - '{"asr_result":"well"}' \ - "whisper-service" \ - "whisper-service" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # Audio2Text service - validate_services \ - "${host_ip}:9199/v1/audio/transcriptions" \ - '"query":"well"' \ - "a2t" \ - "a2t-service" \ - "{\"byte_str\": \"$(input_data_for_test "audio")\"}" - - # Video2Audio service - validate_services \ - "${host_ip}:7078/v1/video2audio" \ - "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAA" \ - "v2a" \ - "v2a-service" \ - "{\"byte_str\": \"$(input_data_for_test "video")\"}" - - # Docsum Data service - video - validate_services \ - "${host_ip}:7079/v1/multimedia2text" \ - '"query":"well' \ - "multimedia2text-service" \ - "multimedia2text" \ - "{\"video\": \"$(input_data_for_test "video")\"}" - - # Docsum Data service - audio - validate_services \ - "${host_ip}:7079/v1/multimedia2text" \ - '"query":"well' \ - "multimedia2text-service" \ - "multimedia2text" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # Docsum Data service - text - validate_services \ - "${host_ip}:7079/v1/multimedia2text" \ - "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \ - "multimedia2text-service" \ - "multimedia2text" \ - "{\"text\": \"$(input_data_for_test "text")\"}" - -} - -function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-mm-*") - if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi - echo "All specified services have been stopped and removed." -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_services - validate_microservices - stop_docker - echo y | docker system prune -} - -main diff --git a/tests/dataprep/test_dataprep_opensearch_langchain.sh b/tests/dataprep/test_dataprep_opensearch_langchain.sh new file mode 100644 index 0000000000..11e8006b6c --- /dev/null +++ b/tests/dataprep/test_dataprep_opensearch_langchain.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +dataprep_service_port="6007" +OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)" + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build -t opea/dataprep-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/opensearch/langchain/Dockerfile . + if [ $? 
-ne 0 ]; then
+        echo "opea/dataprep-opensearch built fail"
+        exit 1
+    else
+        echo "opea/dataprep-opensearch built successful"
+    fi
+}
+
+function start_service() {
+    # Start OpenSearch vector db container
+    docker run -d \
+        --name test-comps-dataprep-opensearch-langchain \
+        -e cluster.name=opensearch-cluster \
+        -e node.name=opensearch-vector-db \
+        -e discovery.seed_hosts=opensearch-vector-db \
+        -e cluster.initial_master_nodes=opensearch-vector-db \
+        -e bootstrap.memory_lock=true \
+        -e "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \
+        -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \
+        --ulimit memlock=-1:-1 \
+        --ulimit nofile=65536:65536 \
+        -p 9200:9200 \
+        -p 9600:9600 \
+        opensearchproject/opensearch:latest
+
+    # Start OpenSearch dataprep container
+    OPENSEARCH_URL="http://${ip_address}:9200"
+    echo $OPENSEARCH_URL
+    INDEX_NAME="file-index"
+    docker run -d \
+        --name test-comps-dataprep-opensearch-langchain-server \
+        -p 6007:6007 \
+        -e https_proxy=$https_proxy \
+        -e http_proxy=$http_proxy \
+        -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \
+        -e OPENSEARCH_URL=$OPENSEARCH_URL \
+        -e INDEX_NAME=$INDEX_NAME \
+        opea/dataprep-opensearch:latest
+
+    sleep 2m
+}
+
+function validate_microservice() {
+    cd $LOG_PATH
+
+    # test /v1/dataprep upload file
+    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep"
+    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="dataprep - upload - file"
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+
+    # test /v1/dataprep upload link
+    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep"
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="dataprep - upload - link"
+
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    # test /v1/dataprep/get_file
+    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/get_file"
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="dataprep - get"
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    if [[ "$RESPONSE_BODY" != "null" ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    # test /v1/dataprep/delete_file
+    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/delete_file"
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "dataprep_file.txt"}' -H 'Content-Type: application/json' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="dataprep - del"
+
+    # check response status
+    if [ "$HTTP_STATUS" -ne "404" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 404. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_del.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 404. Checking content..."
+    fi
+    # check response body
+    if [[ "$RESPONSE_BODY" != *'{"detail":"Single file deletion is not implemented yet"}'* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_del.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+}
+
+function stop_service() {
+    cid=$(docker ps -aq --filter "name=test-comps-dataprep-opensearch-langchain*")
+    if [[ !
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + +} + +function main() { + stop_service + + build_docker_images + start_service + + validate_microservice + + stop_service + # echo y | docker system prune +} + +main diff --git a/tests/retrievers/test_retrievers_opensearch_langchain.sh b/tests/retrievers/test_retrievers_opensearch_langchain.sh new file mode 100644 index 0000000000..a03a28cc08 --- /dev/null +++ b/tests/retrievers/test_retrievers_opensearch_langchain.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +retriever_port="7000" +OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)" + +function build_docker_images() { + cd $WORKPATH + docker build -t opea/retriever-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/opensearch/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/retriever-opensearch built fail" + exit 1 + else + echo "opea/retriever-opensearch built successful" + fi +} + +function start_service() { + # Start OpenSearch vector db container + docker run -d \ + --name test-comps-retriever-opensearch \ + -e cluster.name=opensearch-cluster \ + -e node.name=opensearch-vector-db \ + -e discovery.seed_hosts=opensearch-vector-db \ + -e cluster.initial_master_nodes=opensearch-vector-db \ + -e bootstrap.memory_lock=true \ + -e "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + --ulimit memlock=-1:-1 \ + --ulimit nofile=65536:65536 \ + -p 9200:9200 \ + -p 9600:9600 \ + opensearchproject/opensearch:latest + + # tei endpoint + tei_endpoint=6060 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-opensearch-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # Start OpenSearch retriever container + OPENSEARCH_URL="http://${ip_address}:9200" + INDEX_NAME="file-index" + docker run -d \ + --name test-comps-retriever-opensearch-server \ + -p 7000:7000 \ + -e https_proxy=$https_proxy \ + -e http_proxy=$http_proxy \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + -e OPENSEARCH_URL=$OPENSEARCH_URL \ + -e INDEX_NAME=$INDEX_NAME \ + -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} \ + opea/retriever-opensearch:latest + + sleep 2m +} + +function validate_microservice() { + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:$retriever_port/v1/retrieval" + + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." 
+ else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-opensearch-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-opensearch-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-opensearch-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-opensearch-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi +} + +function stop_service() { + cid=$(docker ps -aq --filter "name=test-comps-retriever-opensearch*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + +} + +function main() { + stop_service + + build_docker_images + start_service + + validate_microservice + + stop_service + # echo y | docker system prune +} + +main
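For a quick smoke test outside the bash harness above, a minimal Python client for the new retriever endpoint could look like the sketch below. This is not part of the patch: it assumes the `opea/retriever-opensearch` container started by `tests/retrievers/test_retrievers_opensearch_langchain.sh` is reachable on `localhost:7000`, that documents have already been ingested via the dataprep service, and that the optional `search_type`/`k` fields exist on `EmbedDoc` with the defaults used here. The 768-dim random vector mirrors the test script and matches the output size of `BAAI/bge-base-en-v1.5`.

```python
# Minimal sketch of a client for the OpenSearch retriever microservice.
import random

import requests

# 768 matches the embedding dimension of BAAI/bge-base-en-v1.5 used in the test.
embedding = [random.uniform(-1, 1) for _ in range(768)]

payload = {
    "text": "What is deep learning?",
    "embedding": embedding,
    # Assumed optional knobs mirrored from EmbedDoc; "similarity" exercises
    # the default search path in the retrieve() handler.
    "search_type": "similarity",
    "k": 4,
}

resp = requests.post("http://localhost:7000/v1/retrieval", json=payload, timeout=30)
resp.raise_for_status()
# The handler returns a SearchedDoc for EmbedDoc-shaped requests.
print(resp.json()["retrieved_docs"])
```

An empty `retrieved_docs` list is not an error here: the handler short-circuits to an empty result when the `file-index` index does not exist or holds no documents.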