From e2e437da40775c8ea011ea309ee509616282626d Mon Sep 17 00:00:00 2001
From: Lance Martin
Date: Wed, 13 Dec 2023 14:57:49 -0800
Subject: [PATCH] fmt, add env vars

---
 .../README.md                                 | 18 +-
 .../ingest.py                                 |  9 +-
 .../rag_chroma_multi_modal_multi_vector.ipynb | 345 ------------------
 .../chain.py                                  |  9 +-
 4 files changed, 19 insertions(+), 362 deletions(-)

diff --git a/templates/rag-chroma-multi-modal-multi-vector/README.md b/templates/rag-chroma-multi-modal-multi-vector/README.md
index ea4772bbb246f..cd4486c941178 100644
--- a/templates/rag-chroma-multi-modal-multi-vector/README.md
+++ b/templates/rag-chroma-multi-modal-multi-vector/README.md
@@ -1,5 +1,5 @@
-# rag-chroma-multi-modal
+# rag-chroma-multi-modal-multi-vector

 Presentations (slide decks, etc) contain visual content that challenges conventional RAG.

@@ -7,24 +7,24 @@ Multi-modal LLMs unlock new ways to build apps over visual content like presenta

 This template performs multi-modal RAG using Chroma with the multi-vector retriever (see [blog](https://blog.langchain.dev/multi-modal-rag-template/)):

-* Extract the slides as images
-* Use GPT-4V to summarize each image
-* Embed the image summaries with a link to the original images
-* Retrieve relevant image based on similarity between the image summary and the user input
+* Extracts the slides as images
+* Uses GPT-4V to summarize each image
+* Embeds the image summaries with a link to the original images
+* Retrieves relevant images based on similarity between the image summary and the user input
 * Finally pass those images to GPT-4V for answer synthesis

 ## Storage

-We will use Upstash to store the images.
+We will use Upstash, which offers Redis with a REST API, to store the images.

 Simply login [here](https://upstash.com/) and create a database.

-This will give you:
+This will give you a REST API with:

 * UPSTASH_URL
 * UPSTASH_TOKEN

-Set these in chain.py (***TODO: Update this? Env var?***)
+Set `UPSTASH_URL` and `UPSTASH_TOKEN` as environment variables to access your database.

 We will use Chroma to store and index the image summaries, which will be created locally in the template directory.

@@ -47,6 +47,8 @@ The app will retrieve images using multi-modal embeddings, and pass them to GPT-

 Set the `OPENAI_API_KEY` environment variable to access the OpenAI GPT-4V.

+Set `UPSTASH_URL` and `UPSTASH_TOKEN` as environment variables to access your database.
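For reference, a minimal sketch of how these pieces fit together — it mirrors what `ingest.py` and `chain.py` in this patch already do, so it is illustrative only, not extra setup. Chroma indexes the image summaries, the Upstash byte store (configured from the environment variables above) holds the original images, and the multi-vector retriever links the two through `doc_id`:

```python
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import UpstashRedisByteStore
from langchain.vectorstores import Chroma

# Chroma collection holding the embedded image summaries (built locally by ingest.py)
vectorstore = Chroma(
    collection_name="image_summaries",
    persist_directory="chroma_db_multi_modal",
    embedding_function=OpenAIEmbeddings(),
)

# Upstash Redis byte store holding the raw base64 images, configured from the env vars above
store = UpstashRedisByteStore(
    url=os.getenv("UPSTASH_URL"),
    token=os.getenv("UPSTASH_TOKEN"),
)

# Summaries are what get searched; the linked original images are what get returned
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key="doc_id",
)
```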
+ ## Usage To use this package, you should first have the LangChain CLI installed: diff --git a/templates/rag-chroma-multi-modal-multi-vector/ingest.py b/templates/rag-chroma-multi-modal-multi-vector/ingest.py index ec67109347c88..c2a6358d68604 100644 --- a/templates/rag-chroma-multi-modal-multi-vector/ingest.py +++ b/templates/rag-chroma-multi-modal-multi-vector/ingest.py @@ -1,5 +1,6 @@ import base64 import io +import os import uuid from io import BytesIO from pathlib import Path @@ -64,8 +65,8 @@ def generate_img_summaries(img_base64_list): try: image_summaries.append(image_summarize(base64_image, prompt)) processed_images.append(base64_image) - except: - print(f"BadRequestError with image {i+1}") + except Exception as e: + print(f"Error with image {i+1}: {e}") return image_summaries, processed_images @@ -136,8 +137,8 @@ def create_multi_vector_retriever(vectorstore, image_summaries, images): """ # Initialize the storage layer for images - UPSTASH_URL = "https://usw1-bright-beagle-34178.upstash.io" - UPSTASH_TOKEN = "AYWCACQgNzk3OTJjZTItMGIxNy00MTEzLWIyZTAtZWI0ZmI1ZGY0NjFhNGRhMGZjNDE4YjgxNGE4MTkzOWYxMzllM2MzZThlOGY=" + UPSTASH_URL = os.getenv("UPSTASH_URL") + UPSTASH_TOKEN = os.getenv("UPSTASH_TOKEN") store = UpstashRedisByteStore(url=UPSTASH_URL, token=UPSTASH_TOKEN) id_key = "doc_id" diff --git a/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector.ipynb b/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector.ipynb index d93ae9a0d29de..bfaa9d82725cb 100644 --- a/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector.ipynb +++ b/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector.ipynb @@ -26,351 +26,6 @@ "rag_app = RemoteRunnable(\"http://localhost:8001/rag-chroma-multi-modal-multi-vector\")\n", "rag_app.invoke(\"What is the projected TAM for observability expected for each year through 2026?\")" ] - }, - { - "cell_type": "markdown", - "id": "98a94c02-1f0e-4e38-a1df-572d95913e01", - "metadata": {}, - "source": [ - "## TMP (TODO: Remove)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e0eb4640-7f44-4c97-942c-54927640d954", - "metadata": {}, - "outputs": [], - "source": [ - "import base64\n", - "import io\n", - "import json\n", - "from pathlib import Path\n", - "\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.pydantic_v1 import BaseModel\n", - "from langchain.retrievers.multi_vector import MultiVectorRetriever\n", - "from langchain.schema.messages import HumanMessage\n", - "from langchain.schema.output_parser import StrOutputParser\n", - "from langchain.schema.runnable import RunnableLambda, RunnablePassthrough\n", - "from langchain.storage import UpstashRedisByteStore\n", - "from langchain.vectorstores import Chroma\n", - "from PIL import Image\n", - "\n", - "\n", - "def resize_base64_image(base64_string, size=(128, 128)):\n", - " \"\"\"\n", - " Resize an image encoded as a Base64 string.\n", - "\n", - " :param base64_string: A Base64 encoded string of the image to be resized.\n", - " :param size: A tuple representing the new size (width, height) for the image.\n", - " :return: A Base64 encoded string of the resized image.\n", - " \"\"\"\n", - " img_data = base64.b64decode(base64_string)\n", - " img = Image.open(io.BytesIO(img_data))\n", - " resized_img = img.resize(size, Image.LANCZOS)\n", - " buffered = io.BytesIO()\n", - " resized_img.save(buffered, 
format=img.format)\n", - " return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", - "\n", - "\n", - "def get_resized_images(docs):\n", - " \"\"\"\n", - " Resize images from base64-encoded strings.\n", - "\n", - " :param docs: A list of base64-encoded image to be resized.\n", - " :return: Dict containing a list of resized base64-encoded strings.\n", - " \"\"\"\n", - " b64_images = []\n", - " for doc in docs:\n", - " doc = json.loads(doc.decode(\"utf-8\"))[\"kwargs\"][\"page_content\"]\n", - " resized_image = resize_base64_image(doc, size=(1280, 720))\n", - " b64_images.append(resized_image)\n", - " return {\"images\": b64_images}\n", - "\n", - "\n", - "def img_prompt_func(data_dict, num_images=2):\n", - " \"\"\"\n", - " GPT-4V prompt for image analysis.\n", - "\n", - " :param data_dict: A dict with images and a user-provided question.\n", - " :param num_images: Number of images to include in the prompt.\n", - " :return: A list containing message objects for each image and the text prompt.\n", - " \"\"\"\n", - " messages = []\n", - " if data_dict[\"context\"][\"images\"]:\n", - " for image in data_dict[\"context\"][\"images\"][:num_images]:\n", - " image_message = {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image}\"},\n", - " }\n", - " messages.append(image_message)\n", - " text_message = {\n", - " \"type\": \"text\",\n", - " \"text\": (\n", - " \"You are an analyst tasked with answering questions about visual content.\\n\"\n", - " \"You will be give a set of image(s) from a slide deck / presentation.\\n\"\n", - " \"Use this information to answer the user question. \\n\"\n", - " f\"User-provided question: {data_dict['question']}\\n\\n\"\n", - " ),\n", - " }\n", - " messages.append(text_message)\n", - " return [HumanMessage(content=messages)]\n", - "\n", - "\n", - "def multi_modal_rag_chain(retriever):\n", - " \"\"\"\n", - " Multi-modal RAG chain,\n", - "\n", - " :param retriever: A function that retrieves the necessary context for the model.\n", - " :return: A chain of functions representing the multi-modal RAG process.\n", - " \"\"\"\n", - " # Initialize the multi-modal Large Language Model with specific parameters\n", - " model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n", - "\n", - " # Define the RAG pipeline\n", - " chain = (\n", - " {\n", - " \"context\": retriever | RunnableLambda(get_resized_images),\n", - " \"question\": RunnablePassthrough(),\n", - " }\n", - " | RunnableLambda(img_prompt_func)\n", - " | model\n", - " | StrOutputParser()\n", - " )\n", - "\n", - " return chain\n", - "\n", - "\n", - "# Load chroma\n", - "vectorstore_mvr = Chroma(\n", - " collection_name=\"image_summaries\",\n", - " persist_directory=\"chroma_db_multi_modal\",\n", - " embedding_function=OpenAIEmbeddings(),\n", - ")\n", - "\n", - "# Load redis\n", - "UPSTASH_URL = \"https://usw1-bright-beagle-34178.upstash.io\"\n", - "UPSTASH_TOKEN = \"AYWCACQgNzk3OTJjZTItMGIxNy00MTEzLWIyZTAtZWI0ZmI1ZGY0NjFhNGRhMGZjNDE4YjgxNGE4MTkzOWYxMzllM2MzZThlOGY=\"\n", - "store = UpstashRedisByteStore(url=UPSTASH_URL,\n", - " token=UPSTASH_TOKEN)\n", - "id_key = \"doc_id\"\n", - "\n", - "# Create the multi-vector retriever\n", - "retriever = MultiVectorRetriever(\n", - " vectorstore=vectorstore_mvr,\n", - " docstore=store,\n", - " id_key=id_key,\n", - ")\n", - "\n", - "# Create RAG chain\n", - "chain = multi_modal_rag_chain(retriever)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": 
"57af60d7-fc78-406f-9cdd-fc64ea7798f9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The total number of customers for Datadog, as shown in the image, is approximately 26,800.'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke(\"What is the total numbner of customers for DataDog?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8549b439-949e-4adf-bb52-3db7bf3f4c70", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a98689a-3bab-4535-bd13-05782ff5aea3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "f9a58eb6-fcd5-4f2b-ae86-891ebf6735c9", - "metadata": {}, - "outputs": [], - "source": [ - "import base64\n", - "import io\n", - "from pathlib import Path\n", - "\n", - "from langchain.retrievers.multi_vector import MultiVectorRetriever\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.pydantic_v1 import BaseModel\n", - "from langchain.schema.document import Document\n", - "from langchain.schema.messages import HumanMessage\n", - "from langchain.schema.output_parser import StrOutputParser\n", - "from langchain.schema.runnable import RunnableLambda, RunnablePassthrough\n", - "from langchain.vectorstores import Chroma\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.storage import UpstashRedisByteStore\n", - "from PIL import Image\n", - "\n", - "\n", - "def resize_base64_image(base64_string, size=(128, 128)):\n", - " \"\"\"\n", - " Resize an image encoded as a Base64 string.\n", - "\n", - " :param base64_string: A Base64 encoded string of the image to be resized.\n", - " :param size: A tuple representing the new size (width, height) for the image.\n", - " :return: A Base64 encoded string of the resized image.\n", - " \"\"\"\n", - " img_data = base64.b64decode(base64_string)\n", - " img = Image.open(io.BytesIO(img_data))\n", - " resized_img = img.resize(size, Image.LANCZOS)\n", - " buffered = io.BytesIO()\n", - " resized_img.save(buffered, format=img.format)\n", - " return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", - "\n", - "\n", - "def get_resized_images(docs):\n", - " \"\"\"\n", - " Resize images from base64-encoded strings.\n", - "\n", - " :param docs: A list of base64-encoded image to be resized.\n", - " :return: Dict containing a list of resized base64-encoded strings.\n", - " \"\"\"\n", - " b64_images = []\n", - " for doc in docs:\n", - " # Convert from bytes and get b64 str from the Document JSON\n", - " doc = json.loads(doc.decode('utf-8'))['kwargs']['page_content']\n", - " resized_image = resize_base64_image(doc, size=(1280, 720))\n", - " b64_images.append(resized_image)\n", - " return {\"images\": b64_images}\n", - "\n", - "\n", - "def img_prompt_func(data_dict, num_images=2):\n", - " \"\"\"\n", - " GPT-4V prompt for image analysis.\n", - "\n", - " :param data_dict: A dict with images and a user-provided question.\n", - " :param num_images: Number of images to include in the prompt.\n", - " :return: A list containing message objects for each image and the text prompt.\n", - " \"\"\"\n", - " messages = []\n", - " if data_dict[\"context\"][\"images\"]:\n", - " for image in data_dict[\"context\"][\"images\"][:num_images]:\n", - " image_message = {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image}\"},\n", - " 
}\n", - " messages.append(image_message)\n", - " text_message = {\n", - " \"type\": \"text\",\n", - " \"text\": (\n", - " \"You are an analyst tasked with answering questions about visual content.\\n\"\n", - " \"You will be give a set of image(s) from a slide deck / presentation.\\n\"\n", - " \"Use this information to answer the user question. \\n\"\n", - " f\"User-provided question: {data_dict['question']}\\n\\n\"\n", - " ),\n", - " }\n", - " messages.append(text_message)\n", - " return [HumanMessage(content=messages)]\n", - "\n", - "\n", - "def multi_modal_rag_chain(retriever):\n", - " \"\"\"\n", - " Multi-modal RAG chain,\n", - "\n", - " :param retriever: A function that retrieves the necessary context for the model.\n", - " :return: A chain of functions representing the multi-modal RAG process.\n", - " \"\"\"\n", - " # Initialize the multi-modal Large Language Model with specific parameters\n", - " model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n", - "\n", - " # Define the RAG pipeline\n", - " chain = (\n", - " {\n", - " \"context\": retriever | RunnableLambda(get_resized_images),\n", - " \"question\": RunnablePassthrough(),\n", - " }\n", - " | RunnableLambda(img_prompt_func)\n", - " | model\n", - " | StrOutputParser()\n", - " )\n", - "\n", - " return chain\n", - "\n", - "\n", - "# Load chroma\n", - "vectorstore_mvr = Chroma(\n", - " collection_name=\"image_summaries\",\n", - " persist_directory=\"chroma_db_multi_modal\",\n", - " embedding_function=OpenAIEmbeddings()\n", - ")\n", - "\n", - "# Load redis\n", - "UPSTASH_URL = \"https://usw1-bright-beagle-34178.upstash.io\"\n", - "UPSTASH_TOKEN = \"AYWCACQgNzk3OTJjZTItMGIxNy00MTEzLWIyZTAtZWI0ZmI1ZGY0NjFhNGRhMGZjNDE4YjgxNGE4MTkzOWYxMzllM2MzZThlOGY=\"\n", - "store = UpstashRedisByteStore(url=UPSTASH_URL,\n", - " token=UPSTASH_TOKEN)\n", - "\n", - "id_key = \"doc_id\"\n", - "\n", - "# Create the multi-vector retriever\n", - "retriever = MultiVectorRetriever(\n", - " vectorstore=vectorstore_mvr,\n", - " docstore=store,\n", - " id_key=id_key,\n", - ")\n", - "\n", - "# Create RAG chain\n", - "chain = multi_modal_rag_chain(retriever)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "dbe8cf0c-91c6-4bb8-8514-342199260559", - "metadata": {}, - "outputs": [], - "source": [ - "q = \"How many total customers does Datadog have?\"\n", - "docs = retriever.get_relevant_documents(q)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "71d008e0-9629-4967-9063-dce31f8b5412", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Datadog has approximately 26,800 total customers.'" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke(q)" - ] } ], "metadata": { diff --git a/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector/chain.py b/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector/chain.py index 59baf8df51f86..648486cbfb98d 100644 --- a/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector/chain.py +++ b/templates/rag-chroma-multi-modal-multi-vector/rag_chroma_multi_modal_multi_vector/chain.py @@ -1,17 +1,17 @@ import base64 import io -import json +import os from pathlib import Path from langchain.chat_models import ChatOpenAI from langchain.embeddings import OpenAIEmbeddings from langchain.pydantic_v1 import BaseModel from langchain.retrievers.multi_vector import MultiVectorRetriever +from 
langchain.schema.document import Document from langchain.schema.messages import HumanMessage from langchain.schema.output_parser import StrOutputParser from langchain.schema.runnable import RunnableLambda, RunnablePassthrough from langchain.storage import UpstashRedisByteStore -from langchain.schema.document import Document from langchain.vectorstores import Chroma from PIL import Image @@ -43,7 +43,6 @@ def get_resized_images(docs): for doc in docs: if isinstance(doc, Document): doc = doc.page_content - # doc = json.loads(doc.decode("utf-8"))["kwargs"]["page_content"] resized_image = resize_base64_image(doc, size=(1280, 720)) b64_images.append(resized_image) return {"images": b64_images} @@ -110,8 +109,8 @@ def multi_modal_rag_chain(retriever): ) # Load redis -UPSTASH_URL = "https://usw1-bright-beagle-34178.upstash.io" -UPSTASH_TOKEN = "AYWCACQgNzk3OTJjZTItMGIxNy00MTEzLWIyZTAtZWI0ZmI1ZGY0NjFhNGRhMGZjNDE4YjgxNGE4MTkzOWYxMzllM2MzZThlOGY=" +UPSTASH_URL = os.getenv("UPSTASH_URL") +UPSTASH_TOKEN = os.getenv("UPSTASH_TOKEN") store = UpstashRedisByteStore(url=UPSTASH_URL, token=UPSTASH_TOKEN) id_key = "doc_id"
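A possible hardening that is not part of this patch: because `chain.py` now reads the Upstash credentials from the environment, a short guard could fail fast with a clear message when they are missing, rather than surfacing a low-level connection error at query time. A sketch, assuming the same variable names introduced above:

```python
import os

# Hypothetical guard; the variable names match the ones introduced in this patch
UPSTASH_URL = os.getenv("UPSTASH_URL")
UPSTASH_TOKEN = os.getenv("UPSTASH_TOKEN")

if not UPSTASH_URL or not UPSTASH_TOKEN:
    raise ValueError(
        "UPSTASH_URL and UPSTASH_TOKEN must be set as environment variables "
        "to reach the Upstash Redis byte store."
    )
```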