class DocPath(BaseDoc):
    """Request document that carries the location of a single document as a string."""

    # Location of the document to process. The dataprep endpoints read this
    # attribute (``doc_path.path``) and hand it to the document loader.
    path: str
+``` + +## Run Docker with CLI + +```bash +docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/gen-ai-comps:dataprep-qdrant-xeon-server +``` + +## Run Docker with Docker Compose + +```bash +cd docker +docker compose -f docker-compose-dataprep-qdrant.yml up -d +``` + +# Invoke Microservices + +Once the document preparation microservice for Qdrant is started, use the command below to invoke it; the microservice converts the document to embeddings and saves them to the database. + +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep +``` diff --git a/comps/dataprep/langchain/qdrant/__init__.py b/comps/dataprep/langchain/qdrant/__init__.py new file mode 100644 index 0000000000..28f108cb63 --- /dev/null +++ b/comps/dataprep/langchain/qdrant/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/comps/dataprep/langchain/qdrant/config.py b/comps/dataprep/langchain/qdrant/config.py new file mode 100644 index 0000000000..c6301dfb9e --- /dev/null +++ b/comps/dataprep/langchain/qdrant/config.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Settings for the Qdrant document-preparation microservice.
# Every value is read once from the environment when this module is imported.

import os

# Embedding model used when no remote embedding endpoint is configured.
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Qdrant configuration
# QDRANT_HOST matches the naming of QDRANT_PORT below. The older variable name
# QDRANT is still honored so existing deployments keep working.
QDRANT_HOST = os.getenv("QDRANT_HOST") or os.getenv("QDRANT", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-qdrant")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# None when TEI_ENDPOINT is not set.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
# Image for the Qdrant document-preparation microservice.
FROM python:3.11-slim

ENV LANG=C.UTF-8

# System packages for the runtime (presumably needed by the OCR/image stack; confirm
# before trimming). The apt lists are removed in the same layer to keep the image small.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Run the service as an unprivileged user.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

# The build context must be the repository root so that `comps` is available.
COPY comps /home/user/comps

# Install the requirements that ship next to the Qdrant service.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/langchain/qdrant/requirements.txt

# Makes `import comps` resolvable from the service script.
ENV PYTHONPATH=$PYTHONPATH:/home/user

# Start the Qdrant service from its own directory so `from config import ...` resolves.
WORKDIR /home/user/comps/dataprep/langchain/qdrant

ENTRYPOINT ["python", "prepare_doc_qdrant.py"]
# Compose file for the Qdrant vector database used by the document-preparation service.
# Only the database is defined here; no dataprep service is started by this file.
version: "3"
services:
  qdrant-vector-db:
    image: qdrant/qdrant
    container_name: qdrant-vector-db
    ports:
      # 6333 is the default port the dataprep configuration connects to.
      - "6333:6333"
      # NOTE(review): presumably Qdrant's secondary (gRPC) port; confirm it is needed.
      - "6334:6334"
@register_microservice(
    name="opea_service@prepare_doc_qdrant",
    expose_endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6000,
    input_datatype=DocPath,
    output_datatype=None,
)
def ingest_documents(doc_path: DocPath):
    """Load a document, split it into chunks, embed the chunks and store them in Qdrant.

    Args:
        doc_path: Request document; its ``path`` attribute is passed to ``docment_loader``.

    Returns:
        None. Progress is reported with ``print``.
    """
    doc_path = doc_path.path
    print(f"Parsing document {doc_path}.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    chunks = splitter.split_text(docment_loader(doc_path))
    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

    # Use the remote TEI endpoint when one is configured, otherwise a local model.
    embedder = (
        HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
        if tei_embedding_endpoint
        else HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
    )

    # Arguments that are the same for every batch written to the vector store.
    store_kwargs = {
        "embedding": embedder,
        "collection_name": COLLECTION_NAME,
        "host": QDRANT_HOST,
        "port": QDRANT_PORT,
    }

    # Write the chunks in fixed-size batches and report progress after each one.
    batch_size = 32
    num_chunks = len(chunks)
    for i in range(0, num_chunks, batch_size):
        Qdrant.from_texts(texts=chunks[i : i + batch_size], **store_kwargs)
        print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
+ +```bash +python prepare_doc_redis.py +``` + +# 🚀Start Microservice with Docker + +## Build Docker Image + +```bash +cd ../../ +docker build -t opea/gen-ai-comps:dataprep-redis-xeon-server --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/langchain/redis/docker/Dockerfile . +``` + +## Run Docker with CLI + +```bash +docker run -d --name="dataprep-redis-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/gen-ai-comps:dataprep-redis-xeon-server +``` + +## Run Docker with Docker Compose + +```bash +cd docker +docker compose -f docker-compose-dataprep-redis.yml up -d +``` + +# Invoke Microservices + +Once the document preparation microservice for Redis is started, use the command below to invoke it; the microservice converts the document to embeddings and saves them to the database. + +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep +``` diff --git a/comps/dataprep/langchain/redis/__init__.py b/comps/dataprep/langchain/redis/__init__.py new file mode 100644 index 0000000000..28f108cb63 --- /dev/null +++ b/comps/dataprep/langchain/redis/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/comps/vectorstores/langchain/redis/config.py b/comps/dataprep/langchain/redis/config.py similarity index 100% rename from comps/vectorstores/langchain/redis/config.py rename to comps/dataprep/langchain/redis/config.py diff --git a/comps/dataprep/langchain/redis/docker/Dockerfile b/comps/dataprep/langchain/redis/docker/Dockerfile new file mode 100644 index 0000000000..ca16698e03 --- /dev/null +++ b/comps/dataprep/langchain/redis/docker/Dockerfile @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Image for the Redis document-preparation microservice.
FROM python:3.11-slim

ENV LANG=C.UTF-8

# System packages for the runtime (presumably needed by the OCR/image stack; confirm
# before trimming). The apt lists are removed in the same layer to keep the image small.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Run the service as an unprivileged user.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

# The build context must be the repository root so that `comps` is available.
COPY comps /home/user/comps

# Install the requirements that ship next to the Redis service.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/langchain/redis/requirements.txt

# Makes `import comps` resolvable from the service script.
ENV PYTHONPATH=$PYTHONPATH:/home/user

# Start the service from its own directory so `from config import ...` resolves.
WORKDIR /home/user/comps/dataprep/langchain/redis

ENTRYPOINT ["python", "prepare_doc_redis.py"]
# Compose file that starts the Redis vector database together with the
# Redis document-preparation microservice.
version: "3"
services:
  redis-vector-db:
    image: redis/redis-stack:7.2.0-v9
    container_name: redis-vector-db
    ports:
      - "6379:6379"
      - "8001:8001"
  dataprep-redis:
    image: opea/gen-ai-comps:dataprep-redis-xeon-server
    container_name: dataprep-redis-server
    ports:
      # The microservice registers itself on port 6000, so that is the port to publish.
      - "6000:6000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
@register_microservice(
    name="opea_service@prepare_doc_redis",
    expose_endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6000,
    input_datatype=DocPath,
    output_datatype=None,
)
def ingest_documents(doc_path: DocPath):
    """Load a document, split it into chunks, embed the chunks and store them in Redis.

    Args:
        doc_path: Request document; its ``path`` attribute is passed to ``docment_loader``.

    Returns:
        None. Progress is reported with ``print``.
    """
    doc_path = doc_path.path
    print(f"Parsing document {doc_path}.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    chunks = splitter.split_text(docment_loader(doc_path))
    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

    # Use the remote TEI endpoint when one is configured, otherwise a local model.
    embedder = (
        HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
        if tei_embedding_endpoint
        else HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
    )

    # Arguments that are the same for every batch written to the vector store.
    store_kwargs = {
        "embedding": embedder,
        "index_name": INDEX_NAME,
        "index_schema": INDEX_SCHEMA,
        "redis_url": REDIS_URL,
    }

    # Write the chunks in fixed-size batches and report progress after each one.
    batch_size = 32
    num_chunks = len(chunks)
    for i in range(0, num_chunks, batch_size):
        Redis.from_texts(texts=chunks[i : i + batch_size], **store_kwargs)
        print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
a/comps/vectorstores/langchain/redis/schema_lcdocs_dim_768.yml b/comps/dataprep/langchain/redis/schema_lcdocs_dim_768.yml similarity index 100% rename from comps/vectorstores/langchain/redis/schema_lcdocs_dim_768.yml rename to comps/dataprep/langchain/redis/schema_lcdocs_dim_768.yml diff --git a/comps/dataprep/langchain/utils.py b/comps/dataprep/langchain/utils.py new file mode 100644 index 0000000000..c380a23d47 --- /dev/null +++ b/comps/dataprep/langchain/utils.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def pdf_loader(file_path):
    """Extract the text of a PDF, including text recognized in its embedded images.

    For every page the stripped page text is appended first. Each image listed
    for the page is then run through OCR; the recognized fragments are joined
    with ", " and terminated with "." unless they already end in "!", "?" or ".".
    All pieces are concatenated without a separator.

    Args:
        file_path: Location of the PDF, passed to ``fitz.open``.

    Returns:
        The concatenated text as a single string.

    Raises:
        ImportError: If ``fitz`` (PyMuPDF) or ``easyocr`` cannot be imported.
    """
    try:
        import easyocr
        import fitz
    except ImportError as err:
        raise ImportError(
            "`PyMuPDF` or 'easyocr' package is not found, please install it with " "`pip install pymupdf easyocr.`"
        ) from err

    pieces = []
    # The context manager closes the document even when OCR or extraction fails.
    with fitz.open(file_path) as doc:
        reader = easyocr.Reader(["en"])
        for page_index in range(doc.page_count):
            page_text = doc.load_page(page_index).get_text().strip()
            if page_text:
                pieces.append(page_text)

            for image_info in doc.get_page_images(page_index):
                if not image_info:
                    continue
                # The first item of the image entry is the xref used to extract the raw bytes.
                image_bytes = doc.extract_image(image_info[0])["image"]
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    pixels = np.array(pil_image)
                ocr_fragments = reader.readtext(pixels, paragraph=True, detail=0)
                image_text = ", ".join(ocr_fragments).strip()
                # Terminate the OCR text so it does not run into the text that follows.
                if not image_text.endswith(("!", "?", ".")):
                    image_text += "."
                pieces.append(image_text)
    return "".join(pieces)


def docment_loader(doc_path):
    """Load a document and return its text content.

    Only PDF files are supported; the extension check ignores case, so both
    ``.pdf`` and ``.PDF`` are accepted.

    Args:
        doc_path: Location of the document.

    Returns:
        The text extracted by ``pdf_loader``.

    Raises:
        NotImplementedError: If the path does not end in a PDF extension.
    """
    if str(doc_path).lower().endswith(".pdf"):
        return pdf_loader(doc_path)
    raise NotImplementedError("Current only support pdf format.")
Download Redis image + +```bash +docker pull redis/redis-stack:7.2.0-v9 +``` + +## 2. Run Redis service + +```bash +docker run -p 6379:6379 -p 8001:8001 redis/redis-stack:7.2.0-v9 +```