From 0a9a519a398fb5ae696cc8f453aba17090d6d1c1 Mon Sep 17 00:00:00 2001 From: volodymyr-memsql <57520563+volodymyr-memsql@users.noreply.github.com> Date: Thu, 22 Feb 2024 01:16:32 +0200 Subject: [PATCH] community[patch]: Added add_images method to SingleStoreDB vector store (#17871) In this pull request, we introduce the add_images method to the SingleStoreDB vector store class, expanding its capabilities to handle multi-modal embeddings seamlessly. This method incorporates image data into the vector store by storing each image's URI as the document content, together with optional metadata and either pre-generated embeddings or embeddings computed using the embed_image method of the provided embedding object. The change includes integration tests validating the behavior of add_images. Additionally, we provide a notebook showcasing the usage of this new method. --------- Co-authored-by: Volodymyr Tkachuk --- .../vectorstores/singlestoredb.ipynb | 48 +++++++++++++- .../vectorstores/singlestoredb.py | 29 +++++++++ .../vectorstores/test_singlestoredb.py | 63 ++++++++++++++++++- 3 files changed, 137 insertions(+), 3 deletions(-) diff --git a/docs/docs/integrations/vectorstores/singlestoredb.ipynb b/docs/docs/integrations/vectorstores/singlestoredb.ipynb index 6cae0d544241e..2278b86764576 100644 --- a/docs/docs/integrations/vectorstores/singlestoredb.ipynb +++ b/docs/docs/integrations/vectorstores/singlestoredb.ipynb @@ -114,13 +114,57 @@ "Enhance your search efficiency with SingleStore DB version 8.5 or above by leveraging [ANN vector indexes](https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/). By setting `use_vector_index=True` during vector store object creation, you can activate this feature. Additionally, if your vectors differ in dimensionality from the default OpenAI embedding size of 1536, ensure to specify the `vector_size` parameter accordingly. 
" ] }, + { + "cell_type": "markdown", + "id": "86efff90", + "metadata": {}, + "source": [ + "## Multi-modal Example: Leveraging CLIP and OpenClip Embeddings\n", + "\n", + "In the realm of multi-modal data analysis, the integration of diverse information types like images and text has become increasingly crucial. One powerful tool facilitating such integration is [CLIP](https://openai.com/research/clip), a cutting-edge model capable of embedding both images and text into a shared semantic space. By doing so, CLIP enables the retrieval of relevant content across different modalities through similarity search.\n", + "\n", + "To illustrate, let's consider an application scenario where we aim to effectively analyze multi-modal data. In this example, we harness the capabilities of [OpenClip multimodal embeddings](https://python.langchain.com/docs/integrations/text_embedding/open_clip), which leverage CLIP's framework. With OpenClip, we can seamlessly embed textual descriptions alongside corresponding images, enabling comprehensive analysis and retrieval tasks. Whether it's identifying visually similar images based on textual queries or finding relevant text passages associated with specific visual content, OpenClip empowers users to explore and extract insights from multi-modal data with remarkable efficiency and accuracy." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "86efff90", + "id": "9c0bce88", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "%pip install -U langchain openai singlestoredb langchain-experimental # (newest versions required for multi-modal)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21a8c25c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from langchain_community.vectorstores import SingleStoreDB\n", + "from langchain_experimental.open_clip import OpenCLIPEmbeddings\n", + "\n", + "os.environ[\"SINGLESTOREDB_URL\"] = \"root:pass@localhost:3306/db\"\n", + "\n", + "TEST_IMAGES_DIR = \"../../modules/images\"\n", + "\n", + "docsearch = SingleStoreDB(OpenCLIPEmbeddings())\n", + "\n", + "image_uris = sorted(\n", + " [\n", + " os.path.join(TEST_IMAGES_DIR, image_name)\n", + " for image_name in os.listdir(TEST_IMAGES_DIR)\n", + " if image_name.endswith(\".jpg\")\n", + " ]\n", + ")\n", + "\n", + "# Add images\n", + "docsearch.add_images(uris=image_uris)" + ] } ], "metadata": { diff --git a/libs/community/langchain_community/vectorstores/singlestoredb.py b/libs/community/langchain_community/vectorstores/singlestoredb.py index 6d43b6199b8f7..3eaba96a889c5 100644 --- a/libs/community/langchain_community/vectorstores/singlestoredb.py +++ b/libs/community/langchain_community/vectorstores/singlestoredb.py @@ -303,6 +303,35 @@ def _create_table(self: SingleStoreDB) -> None: finally: conn.close() + def add_images( + self, + uris: List[str], + metadatas: Optional[List[dict]] = None, + embeddings: Optional[List[List[float]]] = None, + **kwargs: Any, + ) -> List[str]: + """Run images through the embeddings and add to the vectorstore. + + Args: + uris List[str]: File path to images. + Each URI will be added to the vectorstore as document content. + metadatas (Optional[List[dict]], optional): Optional list of metadatas. + Defaults to None. 
+ embeddings (Optional[List[List[float]]], optional): Optional pre-generated + embeddings. Defaults to None. + + Returns: + List[str]: empty list + """ + # Set embeddings + if ( + embeddings is None + and self.embedding is not None + and hasattr(self.embedding, "embed_image") + ): + embeddings = self.embedding.embed_image(uris=uris) + return self.add_texts(uris, metadatas, embeddings, **kwargs) + def add_texts( self, texts: Iterable[str], diff --git a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py index 4f690f079fdd3..da161ce7a128d 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -1,4 +1,6 @@ """Test SingleStoreDB functionality.""" +import os +import tempfile from typing import List import numpy as np @@ -14,6 +16,7 @@ TEST_SINGLE_RESULT = [Document(page_content="foo")] TEST_SINGLE_WITH_METADATA_RESULT = [Document(page_content="foo", metadata={"a": "b"})] TEST_RESULT = [Document(page_content="foo"), Document(page_content="foo")] +TEST_IMAGES_DIR = "" try: import singlestoredb as s2 @@ -22,6 +25,13 @@ except ImportError: singlestoredb_installed = False +try: + from langchain_experimental.open_clip import OpenCLIPEmbeddings + + langchain_experimental_installed = True +except ImportError: + langchain_experimental_installed = False + def drop(table_name: str) -> None: with s2.connect(TEST_SINGLESTOREDB_URL) as conn: @@ -53,6 +63,9 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: def embed_query(self, text: str) -> List[float]: return np.random.rand(100).tolist() + def embed_image(self, uris: List[str]) -> List[List[float]]: + return [np.random.rand(100).tolist() for _ in uris] + @pytest.fixture def texts() -> List[str]: @@ -156,7 +169,7 @@ def test_singlestoredb_vector_index_large() -> None: table_name = 
"test_singlestoredb_vector_index_large" drop(table_name) docsearch = SingleStoreDB.from_texts( - ["foo"] * 300000, + ["foo"] * 30, RandomEmbeddings(), distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, table_name=table_name, @@ -444,3 +457,51 @@ def test_singlestoredb_as_retriever(texts: List[str]) -> None: ), ] drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_add_image(texts: List[str]) -> None: + """Test adding images""" + table_name = "test_singlestoredb_add_image" + drop(table_name) + docsearch = SingleStoreDB( + RandomEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + temp_files = [] + for _ in range(3): + temp_file = tempfile.NamedTemporaryFile(delete=False) + temp_file.write(b"foo") + temp_file.close() + temp_files.append(temp_file.name) + + docsearch.add_images(temp_files) + output = docsearch.similarity_search("foo", k=1) + assert output[0].page_content in temp_files + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +@pytest.mark.skipif( + not langchain_experimental_installed, reason="langchain_experimental not installed" +) +def test_singestoredb_add_image2() -> None: + table_name = "test_singlestoredb_add_images" + drop(table_name) + docsearch = SingleStoreDB( + OpenCLIPEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + image_uris = sorted( + [ + os.path.join(TEST_IMAGES_DIR, image_name) + for image_name in os.listdir(TEST_IMAGES_DIR) + if image_name.endswith(".jpg") + ] + ) + docsearch.add_images(image_uris) + output = docsearch.similarity_search("horse", k=1) + assert "horse" in output[0].page_content + drop(table_name)