From 22d775d0bad24f5f9a83a063712f3a07c6197445 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:31:27 +0100 Subject: [PATCH 01/19] fix: qdrant returns empty list instead of raising error when listing empty store --- .../ragbits-core/src/ragbits/core/vector_stores/qdrant.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index d5bfeb4f..b07aedda 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -168,6 +168,12 @@ async def list( # type: ignore Raises: MetadataNotFoundError: If the metadata is not found. """ + collections = await self._client.get_collections() + collection_names = [collection.name for collection in collections.collections] + + if self._index_name not in collection_names: + return [] + results = await self._client.query_points( collection_name=self._index_name, query_filter=where, From 885a77dc7d7f24f931da3247d77fa3ae2bba7326 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:36:13 +0100 Subject: [PATCH 02/19] fix: add document_meta.source.id to VectorStoreEntry metadata --- .../src/ragbits/document_search/documents/element.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py index 63bf82bd..6597a1bc 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py @@ -102,11 +102,14 @@ def to_vector_db_entry(self, vector: list[float]) -> VectorStoreEntry: Returns: The vector database entry """ + metadata = self.model_dump(exclude={"id", "key"}) + metadata["document_meta"]["source"]["id"] = self.document_meta.source.id + return VectorStoreEntry( id=self.id, key=self.key, vector=vector, - metadata=self.model_dump(exclude={"id", "key"}), + metadata=metadata, ) From f07b293ae9371c3752e227f7e99bfa9a565740bd Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:39:27 +0100 Subject: [PATCH 03/19] feat: add remove methods for vector stores --- .../src/ragbits/core/vector_stores/base.py | 9 +++++++++ .../src/ragbits/core/vector_stores/chroma.py | 10 ++++++++++ .../src/ragbits/core/vector_stores/in_memory.py | 11 +++++++++++ .../src/ragbits/core/vector_stores/qdrant.py | 17 ++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index 9ba342e4..4512c659 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -86,6 +86,15 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None The entries. """ + @abstractmethod + async def remove(self, ids: list[str]) -> None: + """ + Remove entries from the vector store. + + Args: + ids: The list of entries' IDs to remove. + """ + @abstractmethod async def list( self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0 diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index e9f55c95..e5c9325e 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -132,6 +132,16 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None if options.max_distance is None or distance <= options.max_distance ] + @traceable + async def remove(self, ids: list[str]) -> None: + """ + Remove entries from the vector store. + + Args: + ids: The list of entries' IDs to remove. + """ + self._collection.delete(ids=ids) + @traceable async def list( self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0 diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py index 28f69608..3ae96c2a 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py @@ -81,6 +81,17 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None if options.max_distance is None or distance <= options.max_distance ] + @traceable + async def remove(self, ids: list[str]) -> None: + """ + Remove entries from the vector store. + + Args: + ids: The list of entries' IDs to remove. + """ + for id in ids: + del self._storage[id] + @traceable async def list( self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0 diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index b07aedda..16e0bed1 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -1,7 +1,7 @@ import json import qdrant_client -from qdrant_client import AsyncQdrantClient +from qdrant_client import AsyncQdrantClient, models from qdrant_client.models import Distance, Filter, VectorParams from ragbits.core.audit import traceable @@ -146,6 +146,21 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None for id, document, vector, metadata in zip(ids, documents, vectors, metadatas, strict=True) ] + @traceable + async def remove(self, ids: list[str]) -> None: + """ + Remove entries from the vector store. + + Args: + ids: The list of entries' IDs to remove. + """ + await self._client.delete( + collection_name=self._index_name, + points_selector=models.PointIdsList( + points=ids, + ), + ) + @traceable async def list( # type: ignore self, From 0080c8f0c865216856f1bcc38f8385078daa2b34 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:41:45 +0100 Subject: [PATCH 04/19] test: add unit tests for remove method in vector stores --- .../tests/unit/vector_stores/test_chroma.py | 9 +++++++++ .../tests/unit/vector_stores/test_in_memory.py | 9 +++++++++ .../tests/unit/vector_stores/test_qdrant.py | 14 ++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py index f79a5126..10cf8d43 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py @@ -95,6 +95,15 @@ async def test_retrieve( assert entry.key == result["content"] +async def test_remove(mock_chromadb_store: ChromaVectorStore) -> None: + ids_to_remove = ["1c7d6b27-4ef1-537c-ad7c-676edb8bc8a8"] + + await mock_chromadb_store.remove(ids_to_remove) + + mock_chromadb_store._client.get_or_create_collection().delete.assert_called_once() # type: ignore + mock_chromadb_store._client.get_or_create_collection().delete.assert_called_with(ids=ids_to_remove) + + async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: mock_chromadb_store._collection.get.return_value = { # type: ignore "metadatas": [ diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py index 2c167330..17dc998e 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py @@ -72,6 +72,15 @@ async def test_retrieve(store: InMemoryVectorStore, k: int, max_distance: float for entry, result in zip(entries, results, strict=True): assert entry.metadata["name"] == result +async def test_remove(store: InMemoryVectorStore) -> None: + entries = await store.list() + entry_number = len(entries) + + ids_to_remove = [entries[0].id] + await store.remove(ids_to_remove) + + assert len(await store.list()) == entry_number - 1 + async def test_list_all(store: InMemoryVectorStore) -> None: results = await store.list() diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py index 22cb9391..c76715a0 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py @@ -96,6 +96,20 @@ async def test_retrieve(mock_qdrant_store: QdrantVectorStore) -> None: assert entry.vector == result["vector"] +async def test_remove(mock_qdrant_store: QdrantVectorStore) -> None: + ids_to_remove = ["1c7d6b27-4ef1-537c-ad7c-676edb8bc8a8"] + + await mock_qdrant_store.remove(ids_to_remove) + + mock_qdrant_store._client.delete.assert_called_once() # type: ignore + mock_qdrant_store._client.delete.assert_called_with( + collection_name="test_collection", + points_selector=models.PointIdsList( + points=ids_to_remove, + ), + ) + + async def test_list(mock_qdrant_store: QdrantVectorStore) -> None: mock_qdrant_store._client.query_points.return_value = models.QueryResponse( # type: ignore points=[ From 28571802b3034d42b768b50a70a0dc9e421ace88 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:43:41 +0100 Subject: [PATCH 05/19] feat: add removing entries from vector store when ingesting documents with same source.id --- .../src/ragbits/document_search/_main.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index 706e1e86..5de4c0ff 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -141,8 +141,26 @@ async def ingest( elements = await self.processing_strategy.process_documents( documents, self.document_processor_router, document_processor ) + await self.remove_entries_with_same_id(elements) await self.insert_elements(elements) + async def remove_entries_with_same_id(self, elements: list[Element]) -> None: + """ + Remove entries from the vector store whose source id is present in the elements' metadata. + + Args: + elements: List of elements whose source ids will be checked and removed from the vector store if present. + """ + unique_source_ids = {element.document_meta.source.id for element in elements} + + ids_to_delete = [] + for entry in await self.vector_store.list(): + if entry.metadata["document_meta"]["source"]["id"] in unique_source_ids: + ids_to_delete.append(entry.id) + + if ids_to_delete: + await self.vector_store.remove(ids_to_delete) + async def insert_elements(self, elements: list[Element]) -> None: """ Insert Elements into the vector store. From 44f3ea74da15869c433d40d9dcb50708ce63db9c Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Wed, 27 Nov 2024 17:59:30 +0100 Subject: [PATCH 06/19] test: add integration tests for vector stores, regarding ingesting documents with the same source --- .../integration/vector_stores/test_chroma.py | 62 +++++++++++++++++++ .../vector_stores/test_in_memory.py | 57 +++++++++++++++++ .../integration/vector_stores/test_qdrant.py | 62 +++++++++++++++++++ 3 files changed, 181 insertions(+) create mode 100644 packages/ragbits-core/tests/integration/vector_stores/test_chroma.py create mode 100644 packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py create mode 100644 packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py new file mode 100644 index 00000000..78cc4cba --- /dev/null +++ b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py @@ -0,0 +1,62 @@ +import os +import tempfile +from pathlib import Path + +from chromadb import EphemeralClient + +from ragbits.core.embeddings.litellm import LiteLLMEmbeddings +from ragbits.core.vector_stores.chroma import ChromaVectorStore +from ragbits.document_search import DocumentSearch +from ragbits.document_search.documents.document import DocumentMeta + + +async def test_update_document() -> None: + document_1_content = "This is a test sentence and it should be in the vector store" + document_2_content = "This is another test sentence and it should be removed from the vector store" + document_2_new_content = "This is one more test sentence and it should be added to the vector store" + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: + document_1.write(document_1_content) + document_1_path = document_1.name + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: + document_2.write(document_2_content) + document_2_path = document_2.name + + document_1 = DocumentMeta.from_local_path(Path(document_1_path)) + document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + + embedder = LiteLLMEmbeddings( + model="text-embedding-3-small", + ) + vector_store = ChromaVectorStore( + client=EphemeralClient(), + index_name="test_index_name", + ) + document_search = DocumentSearch( + embedder=embedder, + vector_store=vector_store, + ) + await document_search.ingest([document_1, document_2]) + + with open(document_2_path, "w") as file: + file.write(document_2_new_content) + + await document_search.ingest([document_2]) + + os.remove(document_1_path) + os.remove(document_2_path) + + document_1_present = False + document_2_old_present = False + document_2_new_present = False + for entry in await vector_store.list(): + if entry.key == document_1_content: + document_1_present = True + elif entry.key == document_2_content: + document_2_old_present = True + elif entry.key == document_2_new_content: + document_2_new_present = True + assert document_1_present + assert document_2_new_present + assert not document_2_old_present diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py new file mode 100644 index 00000000..c50fcf6f --- /dev/null +++ b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py @@ -0,0 +1,57 @@ +import os +import tempfile +from pathlib import Path + +from ragbits.core.embeddings.litellm import LiteLLMEmbeddings +from ragbits.core.vector_stores.in_memory import InMemoryVectorStore +from ragbits.document_search import DocumentSearch +from ragbits.document_search.documents.document import DocumentMeta + + +async def test_update_document() -> None: + document_1_content = "This is a test sentence and it should be in the vector store" + document_2_content = "This is another test sentence and it should be removed from the vector store" + document_2_new_content = "This is one more test sentence and it should be added to the vector store" + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: + document_1.write(document_1_content) + document_1_path = document_1.name + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: + document_2.write(document_2_content) + document_2_path = document_2.name + + document_1 = DocumentMeta.from_local_path(Path(document_1_path)) + document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + + embedder = LiteLLMEmbeddings( + model="text-embedding-3-small", + ) + vector_store = InMemoryVectorStore() + document_search = DocumentSearch( + embedder=embedder, + vector_store=vector_store, + ) + await document_search.ingest([document_1, document_2]) + + with open(document_2_path, "w") as file: + file.write(document_2_new_content) + + await document_search.ingest([document_2]) + + os.remove(document_1_path) + os.remove(document_2_path) + + document_1_present = False + document_2_old_present = False + document_2_new_present = False + for entry in await vector_store.list(): + if entry.key == document_1_content: + document_1_present = True + elif entry.key == document_2_content: + document_2_old_present = True + elif entry.key == document_2_new_content: + document_2_new_present = True + assert document_1_present + assert document_2_new_present + assert not document_2_old_present diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py new file mode 100644 index 00000000..005bb760 --- /dev/null +++ b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py @@ -0,0 +1,62 @@ +import os +import tempfile +from pathlib import Path + +from qdrant_client import AsyncQdrantClient + +from ragbits.core.embeddings.litellm import LiteLLMEmbeddings +from ragbits.core.vector_stores.qdrant import QdrantVectorStore +from ragbits.document_search import DocumentSearch +from ragbits.document_search.documents.document import DocumentMeta + + +async def test_update_document() -> None: + document_1_content = "This is a test sentence and it should be in the vector store" + document_2_content = "This is another test sentence and it should be removed from the vector store" + document_2_new_content = "This is one more test sentence and it should be added to the vector store" + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: + document_1.write(document_1_content) + document_1_path = document_1.name + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: + document_2.write(document_2_content) + document_2_path = document_2.name + + document_1 = DocumentMeta.from_local_path(Path(document_1_path)) + document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + + embedder = LiteLLMEmbeddings( + model="text-embedding-3-small", + ) + vector_store = QdrantVectorStore( + client=AsyncQdrantClient(":memory:"), + index_name="test_index_name", + ) + document_search = DocumentSearch( + embedder=embedder, + vector_store=vector_store, + ) + await document_search.ingest([document_1, document_2]) + + with open(document_2_path, "w") as file: + file.write(document_2_new_content) + + await document_search.ingest([document_2]) + + os.remove(document_1_path) + os.remove(document_2_path) + + document_1_present = False + document_2_old_present = False + document_2_new_present = False + for entry in await vector_store.list(): + if entry.key == document_1_content: + document_1_present = True + elif entry.key == document_2_content: + document_2_old_present = True + elif entry.key == document_2_new_content: + document_2_new_present = True + assert document_1_present + assert document_2_new_present + assert not document_2_old_present From 42c6afdf6b203cc330b21acb2db480d379ead232 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 08:48:47 +0100 Subject: [PATCH 07/19] fix: lint --- .../src/ragbits/core/vector_stores/qdrant.py | 3 ++- .../tests/integration/vector_stores/test_chroma.py | 12 ++++++------ .../integration/vector_stores/test_in_memory.py | 12 ++++++------ .../tests/integration/vector_stores/test_qdrant.py | 12 ++++++------ .../tests/unit/vector_stores/test_chroma.py | 2 +- .../tests/unit/vector_stores/test_in_memory.py | 1 + .../tests/unit/vector_stores/test_qdrant.py | 5 +++-- 7 files changed, 25 insertions(+), 22 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index 16e0bed1..5f05a872 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -1,4 +1,5 @@ import json +import typing import qdrant_client from qdrant_client import AsyncQdrantClient, models @@ -157,7 +158,7 @@ async def remove(self, ids: list[str]) -> None: await self._client.delete( collection_name=self._index_name, points_selector=models.PointIdsList( - points=ids, + points=typing.cast(list[int | str], ids), ), ) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py index 78cc4cba..cedbf1c4 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py @@ -15,13 +15,13 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: - document_1.write(document_1_content) - document_1_path = document_1.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: + file_1.write(document_1_content) + document_1_path = file_1.name - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: - document_2.write(document_2_content) - document_2_path = document_2.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: + file_2.write(document_2_content) + document_2_path = file_2.name document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py index c50fcf6f..f5c31170 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py @@ -13,13 +13,13 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: - document_1.write(document_1_content) - document_1_path = document_1.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: + file_1.write(document_1_content) + document_1_path = file_1.name - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: - document_2.write(document_2_content) - document_2_path = document_2.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: + file_2.write(document_2_content) + document_2_path = file_2.name document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py index 005bb760..34f4a809 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py @@ -15,13 +15,13 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_1: - document_1.write(document_1_content) - document_1_path = document_1.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: + file_1.write(document_1_content) + document_1_path = file_1.name - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as document_2: - document_2.write(document_2_content) - document_2_path = document_2.name + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: + file_2.write(document_2_content) + document_2_path = file_2.name document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py index 10cf8d43..c2de3e26 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py @@ -101,7 +101,7 @@ async def test_remove(mock_chromadb_store: ChromaVectorStore) -> None: await mock_chromadb_store.remove(ids_to_remove) mock_chromadb_store._client.get_or_create_collection().delete.assert_called_once() # type: ignore - mock_chromadb_store._client.get_or_create_collection().delete.assert_called_with(ids=ids_to_remove) + mock_chromadb_store._client.get_or_create_collection().delete.assert_called_with(ids=ids_to_remove) # type: ignore async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py index 17dc998e..dddf4f7d 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py @@ -72,6 +72,7 @@ async def test_retrieve(store: InMemoryVectorStore, k: int, max_distance: float for entry, result in zip(entries, results, strict=True): assert entry.metadata["name"] == result + async def test_remove(store: InMemoryVectorStore) -> None: entries = await store.list() entry_number = len(entries) diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py index c76715a0..8600af25 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py @@ -1,3 +1,4 @@ +import typing from unittest.mock import AsyncMock import pytest @@ -102,10 +103,10 @@ async def test_remove(mock_qdrant_store: QdrantVectorStore) -> None: await mock_qdrant_store.remove(ids_to_remove) mock_qdrant_store._client.delete.assert_called_once() # type: ignore - mock_qdrant_store._client.delete.assert_called_with( + mock_qdrant_store._client.delete.assert_called_with( # type: ignore collection_name="test_collection", points_selector=models.PointIdsList( - points=ids_to_remove, + points=typing.cast(list[int | str], ids_to_remove), ), ) From af80819ebd4f64f0d3acb5737179475abac113ac Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 12:04:20 +0100 Subject: [PATCH 08/19] fix: change checking qdrant collection existance method and update qdrant test_list --- .../ragbits-core/src/ragbits/core/vector_stores/qdrant.py | 6 ++---- .../ragbits-core/tests/unit/vector_stores/test_qdrant.py | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index 5f05a872..ed355658 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -184,10 +184,8 @@ async def list( # type: ignore Raises: MetadataNotFoundError: If the metadata is not found. """ - collections = await self._client.get_collections() - collection_names = [collection.name for collection in collections.collections] - - if self._index_name not in collection_names: + collection_exists = await self._client.collection_exists(collection_name=self._index_name) + if not collection_exists: return [] results = await self._client.query_points( diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py index 8600af25..7868a455 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py @@ -112,6 +112,7 @@ async def test_remove(mock_qdrant_store: QdrantVectorStore) -> None: async def test_list(mock_qdrant_store: QdrantVectorStore) -> None: + mock_qdrant_store._client.collection_exists.return_value = True # type: ignore mock_qdrant_store._client.query_points.return_value = models.QueryResponse( # type: ignore points=[ models.ScoredPoint( From c8870ab10936598f84a88d2bc4254c1d5bc046dc Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 13:24:45 +0100 Subject: [PATCH 09/19] fix: mock embedder in integration tests --- .../tests/integration/vector_stores/test_chroma.py | 7 +++---- .../tests/integration/vector_stores/test_in_memory.py | 7 +++---- .../tests/integration/vector_stores/test_qdrant.py | 7 +++---- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py index cedbf1c4..d03a5cd3 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py @@ -1,10 +1,10 @@ import os import tempfile from pathlib import Path +from unittest.mock import AsyncMock from chromadb import EphemeralClient -from ragbits.core.embeddings.litellm import LiteLLMEmbeddings from ragbits.core.vector_stores.chroma import ChromaVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta @@ -26,9 +26,8 @@ async def test_update_document() -> None: document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) - embedder = LiteLLMEmbeddings( - model="text-embedding-3-small", - ) + embedder = AsyncMock() + embedder.embed_text.return_value = [[0.0], [0.0]] vector_store = ChromaVectorStore( client=EphemeralClient(), index_name="test_index_name", diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py index f5c31170..8a6dae82 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py @@ -1,8 +1,8 @@ import os import tempfile from pathlib import Path +from unittest.mock import AsyncMock -from ragbits.core.embeddings.litellm import LiteLLMEmbeddings from ragbits.core.vector_stores.in_memory import InMemoryVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta @@ -24,9 +24,8 @@ async def test_update_document() -> None: document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) - embedder = LiteLLMEmbeddings( - model="text-embedding-3-small", - ) + embedder = AsyncMock() + embedder.embed_text.return_value = [[0.0], [0.0]] vector_store = InMemoryVectorStore() document_search = DocumentSearch( embedder=embedder, diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py index 34f4a809..2b791708 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py @@ -1,10 +1,10 @@ import os import tempfile from pathlib import Path +from unittest.mock import AsyncMock from qdrant_client import AsyncQdrantClient -from ragbits.core.embeddings.litellm import LiteLLMEmbeddings from ragbits.core.vector_stores.qdrant import QdrantVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta @@ -26,9 +26,8 @@ async def test_update_document() -> None: document_1 = DocumentMeta.from_local_path(Path(document_1_path)) document_2 = DocumentMeta.from_local_path(Path(document_2_path)) - embedder = LiteLLMEmbeddings( - model="text-embedding-3-small", - ) + embedder = AsyncMock() + embedder.embed_text.return_value = [[0.0], [0.0]] vector_store = QdrantVectorStore( client=AsyncQdrantClient(":memory:"), index_name="test_index_name", From c042a33d1155ae595dd67a6ce2d1610864fec83a Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 13:50:37 +0100 Subject: [PATCH 10/19] chore: rename function from remove_entries_with_same_id to remove_entries_with_same_sources --- .../src/ragbits/document_search/_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index 5de4c0ff..965fc0d4 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -141,10 +141,10 @@ async def ingest( elements = await self.processing_strategy.process_documents( documents, self.document_processor_router, document_processor ) - await self.remove_entries_with_same_id(elements) + await self.remove_entries_with_same_sources(elements) await self.insert_elements(elements) - async def remove_entries_with_same_id(self, elements: list[Element]) -> None: + async def remove_entries_with_same_sources(self, elements: list[Element]) -> None: """ Remove entries from the vector store whose source id is present in the elements' metadata. From cb2d051e34a7cde1f7dc81cf6e85ce9e18db69ce Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 15:41:39 +0100 Subject: [PATCH 11/19] refactor: optimize integration tests for better readability and efficiency --- .../integration/vector_stores/test_chroma.py | 39 +++++-------------- .../vector_stores/test_in_memory.py | 39 +++++-------------- .../integration/vector_stores/test_qdrant.py | 39 +++++-------------- 3 files changed, 30 insertions(+), 87 deletions(-) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py index d03a5cd3..462186ae 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py @@ -1,6 +1,3 @@ -import os -import tempfile -from pathlib import Path from unittest.mock import AsyncMock from chromadb import EphemeralClient @@ -8,6 +5,7 @@ from ragbits.core.vector_stores.chroma import ChromaVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.sources import LocalFileSource async def test_update_document() -> None: @@ -15,16 +13,8 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: - file_1.write(document_1_content) - document_1_path = file_1.name - - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: - file_2.write(document_2_content) - document_2_path = file_2.name - - document_1 = DocumentMeta.from_local_path(Path(document_1_path)) - document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + document_1 = DocumentMeta.create_text_document_from_literal(document_1_content) + document_2 = DocumentMeta.create_text_document_from_literal(document_2_content) embedder = AsyncMock() embedder.embed_text.return_value = [[0.0], [0.0]] @@ -38,24 +28,15 @@ async def test_update_document() -> None: ) await document_search.ingest([document_1, document_2]) + if isinstance(document_2.source, LocalFileSource): + document_2_path = document_2.source.path with open(document_2_path, "w") as file: file.write(document_2_new_content) await document_search.ingest([document_2]) - os.remove(document_1_path) - os.remove(document_2_path) - - document_1_present = False - document_2_old_present = False - document_2_new_present = False - for entry in await vector_store.list(): - if entry.key == document_1_content: - document_1_present = True - elif entry.key == document_2_content: - document_2_old_present = True - elif entry.key == document_2_new_content: - document_2_new_present = True - assert document_1_present - assert document_2_new_present - assert not document_2_old_present + document_contents = {entry.key for entry in await vector_store.list()} + + assert document_1_content in document_contents + assert document_2_new_content in document_contents + assert document_2_content not in document_contents diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py index 8a6dae82..8fab57f8 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py @@ -1,11 +1,9 @@ -import os -import tempfile -from pathlib import Path from unittest.mock import AsyncMock from ragbits.core.vector_stores.in_memory import InMemoryVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.sources import LocalFileSource async def test_update_document() -> None: @@ -13,16 +11,8 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: - file_1.write(document_1_content) - document_1_path = file_1.name - - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: - file_2.write(document_2_content) - document_2_path = file_2.name - - document_1 = DocumentMeta.from_local_path(Path(document_1_path)) - document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + document_1 = DocumentMeta.create_text_document_from_literal(document_1_content) + document_2 = DocumentMeta.create_text_document_from_literal(document_2_content) embedder = AsyncMock() embedder.embed_text.return_value = [[0.0], [0.0]] @@ -33,24 +23,15 @@ async def test_update_document() -> None: ) await document_search.ingest([document_1, document_2]) + if isinstance(document_2.source, LocalFileSource): + document_2_path = document_2.source.path with open(document_2_path, "w") as file: file.write(document_2_new_content) await document_search.ingest([document_2]) - os.remove(document_1_path) - os.remove(document_2_path) - - document_1_present = False - document_2_old_present = False - document_2_new_present = False - for entry in await vector_store.list(): - if entry.key == document_1_content: - document_1_present = True - elif entry.key == document_2_content: - document_2_old_present = True - elif entry.key == document_2_new_content: - document_2_new_present = True - assert document_1_present - assert document_2_new_present - assert not document_2_old_present + document_contents = {entry.key for entry in await vector_store.list()} + + assert document_1_content in document_contents + assert document_2_new_content in document_contents + assert document_2_content not in document_contents diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py index 2b791708..f364909e 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py @@ -1,6 +1,3 @@ -import os -import tempfile -from pathlib import Path from unittest.mock import AsyncMock from qdrant_client import AsyncQdrantClient @@ -8,6 +5,7 @@ from ragbits.core.vector_stores.qdrant import QdrantVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.sources import LocalFileSource async def test_update_document() -> None: @@ -15,16 +13,8 @@ async def test_update_document() -> None: document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_1: - file_1.write(document_1_content) - document_1_path = file_1.name - - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as file_2: - file_2.write(document_2_content) - document_2_path = file_2.name - - document_1 = DocumentMeta.from_local_path(Path(document_1_path)) - document_2 = DocumentMeta.from_local_path(Path(document_2_path)) + document_1 = DocumentMeta.create_text_document_from_literal(document_1_content) + document_2 = DocumentMeta.create_text_document_from_literal(document_2_content) embedder = AsyncMock() embedder.embed_text.return_value = [[0.0], [0.0]] @@ -38,24 +28,15 @@ async def test_update_document() -> None: ) await document_search.ingest([document_1, document_2]) + if isinstance(document_2.source, LocalFileSource): + document_2_path = document_2.source.path with open(document_2_path, "w") as file: file.write(document_2_new_content) await document_search.ingest([document_2]) - os.remove(document_1_path) - os.remove(document_2_path) - - document_1_present = False - document_2_old_present = False - document_2_new_present = False - for entry in await vector_store.list(): - if entry.key == document_1_content: - document_1_present = True - elif entry.key == document_2_content: - document_2_old_present = True - elif entry.key == document_2_new_content: - document_2_new_present = True - assert document_1_present - assert document_2_new_present - assert not document_2_old_present + document_contents = {entry.key for entry in await vector_store.list()} + + assert document_1_content in document_contents + assert document_2_new_content in document_contents + assert document_2_content not in document_contents From feccfc6192ddff6ec2868b16383008568da9b3cd Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Thu, 28 Nov 2024 16:12:32 +0100 Subject: [PATCH 12/19] chore: rename integration tests from test_update_document to test_handling_document_ingestion_with_different_content_and_verifying_replacement --- .../ragbits-core/tests/integration/vector_stores/test_chroma.py | 2 +- .../tests/integration/vector_stores/test_in_memory.py | 2 +- .../ragbits-core/tests/integration/vector_stores/test_qdrant.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py index 462186ae..b248074c 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py @@ -8,7 +8,7 @@ from ragbits.document_search.documents.sources import LocalFileSource -async def test_update_document() -> None: +async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: document_1_content = "This is a test sentence and it should be in the vector store" document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py index 8fab57f8..71734c8d 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py @@ -6,7 +6,7 @@ from ragbits.document_search.documents.sources import LocalFileSource -async def test_update_document() -> None: +async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: document_1_content = "This is a test sentence and it should be in the vector store" document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py index f364909e..8c9dea6b 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py @@ -8,7 +8,7 @@ from ragbits.document_search.documents.sources import LocalFileSource -async def test_update_document() -> None: +async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: document_1_content = "This is a test sentence and it should be in the vector store" document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" From 1e97250cad7c94a49643aba234887d829ba46aeb Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 08:24:06 +0100 Subject: [PATCH 13/19] chore: make remove_entries_with_same_sources private --- .../src/ragbits/document_search/_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index 965fc0d4..5aa897e9 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -141,10 +141,10 @@ async def ingest( elements = await self.processing_strategy.process_documents( documents, self.document_processor_router, document_processor ) - await self.remove_entries_with_same_sources(elements) + await self.__remove_entries_with_same_sources(elements) await self.insert_elements(elements) - async def remove_entries_with_same_sources(self, elements: list[Element]) -> None: + async def __remove_entries_with_same_sources(self, elements: list[Element]) -> None: """ Remove entries from the vector store whose source id is present in the elements' metadata. From 96e99484c16c09b3ab4602f3d7253929b9c100d6 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 12:30:00 +0100 Subject: [PATCH 14/19] fix: handle limit=None in Qdrant --- .../ragbits-core/src/ragbits/core/vector_stores/qdrant.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index ed355658..212d75a5 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -188,10 +188,12 @@ async def list( # type: ignore if not collection_exists: return [] + limit = limit or (await self._client.count(collection_name=self._index_name)).count + results = await self._client.query_points( collection_name=self._index_name, query_filter=where, - limit=limit or 10, + limit=limit, offset=offset, with_payload=True, with_vectors=True, From b48a1a26f0962696abc562b31ce04de2e84d1248 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 12:37:13 +0100 Subject: [PATCH 15/19] chore: add TODO for passing 'where' argument to list method in __remove_entries_with_same_sources --- .../ragbits-document-search/src/ragbits/document_search/_main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index 674b63cd..a4941a2f 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -154,6 +154,7 @@ async def __remove_entries_with_same_sources(self, elements: list[Element]) -> N unique_source_ids = {element.document_meta.source.id for element in elements} ids_to_delete = [] + # TODO: Pass 'where' argument to the list method to filter results and optimize search for entry in await self.vector_store.list(): if entry.metadata["document_meta"]["source"]["id"] in unique_source_ids: ids_to_delete.append(entry.id) From 3d8a32f7462610b9c2358957671734e02e636a52 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 12:45:53 +0100 Subject: [PATCH 16/19] fix: change way for obtaining metadata from entry --- .../src/ragbits/document_search/_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index a4941a2f..ccc7f8c6 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -156,7 +156,7 @@ async def __remove_entries_with_same_sources(self, elements: list[Element]) -> N ids_to_delete = [] # TODO: Pass 'where' argument to the list method to filter results and optimize search for entry in await self.vector_store.list(): - if entry.metadata["document_meta"]["source"]["id"] in unique_source_ids: + if entry.metadata.get("document_meta", {}).get("source", {}).get("id") in unique_source_ids: ids_to_delete.append(entry.id) if ids_to_delete: From 887f946ea1a8c82e2af0c7eb33e7c91b1272f5d5 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 14:06:22 +0100 Subject: [PATCH 17/19] docs: update docstring for qdrant remove method --- packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py index 212d75a5..221bf107 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py @@ -154,6 +154,9 @@ async def remove(self, ids: list[str]) -> None: Args: ids: The list of entries' IDs to remove. + + Raises: + ValueError: If collection named `self._index_name` is not present in the vector store. """ await self._client.delete( collection_name=self._index_name, From a09bf2c4c040106c2cb972446037796d5baf91bc Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 15:27:15 +0100 Subject: [PATCH 18/19] chore: update function name --- .../src/ragbits/document_search/_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index ccc7f8c6..104ebb61 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -141,10 +141,10 @@ async def ingest( elements = await self.processing_strategy.process_documents( documents, self.document_processor_router, document_processor ) - await self.__remove_entries_with_same_sources(elements) + await self._remove_entries_with_same_sources(elements) await self.insert_elements(elements) - async def __remove_entries_with_same_sources(self, elements: list[Element]) -> None: + async def _remove_entries_with_same_sources(self, elements: list[Element]) -> None: """ Remove entries from the vector store whose source id is present in the elements' metadata. From e84f1105be74f7ac93b8b8aa43731ea0314aee57 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Fri, 29 Nov 2024 15:45:48 +0100 Subject: [PATCH 19/19] refactor: combine integration tests to one parametrized test --- .../integration/vector_stores/test_chroma.py | 42 ------------------- .../vector_stores/test_in_memory.py | 37 ---------------- .../{test_qdrant.py => test_vector_store.py} | 27 +++++++++--- 3 files changed, 22 insertions(+), 84 deletions(-) delete mode 100644 packages/ragbits-core/tests/integration/vector_stores/test_chroma.py delete mode 100644 packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py rename packages/ragbits-core/tests/integration/vector_stores/{test_qdrant.py => test_vector_store.py} (70%) diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py b/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py deleted file mode 100644 index b248074c..00000000 --- a/packages/ragbits-core/tests/integration/vector_stores/test_chroma.py +++ /dev/null @@ -1,42 +0,0 @@ -from unittest.mock import AsyncMock - -from chromadb import EphemeralClient - -from ragbits.core.vector_stores.chroma import ChromaVectorStore -from ragbits.document_search import DocumentSearch -from ragbits.document_search.documents.document import DocumentMeta -from ragbits.document_search.documents.sources import LocalFileSource - - -async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: - document_1_content = "This is a test sentence and it should be in the vector store" - document_2_content = "This is another test sentence and it should be removed from the vector store" - document_2_new_content = "This is one more test sentence and it should be added to the vector store" - - document_1 = DocumentMeta.create_text_document_from_literal(document_1_content) - document_2 = DocumentMeta.create_text_document_from_literal(document_2_content) - - embedder = AsyncMock() - embedder.embed_text.return_value = [[0.0], [0.0]] - vector_store = ChromaVectorStore( - client=EphemeralClient(), - index_name="test_index_name", - ) - document_search = DocumentSearch( - embedder=embedder, - vector_store=vector_store, - ) - await document_search.ingest([document_1, document_2]) - - if isinstance(document_2.source, LocalFileSource): - document_2_path = document_2.source.path - with open(document_2_path, "w") as file: - file.write(document_2_new_content) - - await document_search.ingest([document_2]) - - document_contents = {entry.key for entry in await vector_store.list()} - - assert document_1_content in document_contents - assert document_2_new_content in document_contents - assert document_2_content not in document_contents diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py deleted file mode 100644 index 71734c8d..00000000 --- a/packages/ragbits-core/tests/integration/vector_stores/test_in_memory.py +++ /dev/null @@ -1,37 +0,0 @@ -from unittest.mock import AsyncMock - -from ragbits.core.vector_stores.in_memory import InMemoryVectorStore -from ragbits.document_search import DocumentSearch -from ragbits.document_search.documents.document import DocumentMeta -from ragbits.document_search.documents.sources import LocalFileSource - - -async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: - document_1_content = "This is a test sentence and it should be in the vector store" - document_2_content = "This is another test sentence and it should be removed from the vector store" - document_2_new_content = "This is one more test sentence and it should be added to the vector store" - - document_1 = DocumentMeta.create_text_document_from_literal(document_1_content) - document_2 = DocumentMeta.create_text_document_from_literal(document_2_content) - - embedder = AsyncMock() - embedder.embed_text.return_value = [[0.0], [0.0]] - vector_store = InMemoryVectorStore() - document_search = DocumentSearch( - embedder=embedder, - vector_store=vector_store, - ) - await document_search.ingest([document_1, document_2]) - - if isinstance(document_2.source, LocalFileSource): - document_2_path = document_2.source.path - with open(document_2_path, "w") as file: - file.write(document_2_new_content) - - await document_search.ingest([document_2]) - - document_contents = {entry.key for entry in await vector_store.list()} - - assert document_1_content in document_contents - assert document_2_new_content in document_contents - assert document_2_content not in document_contents diff --git a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py b/packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py similarity index 70% rename from packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py rename to packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py index 8c9dea6b..91160322 100644 --- a/packages/ragbits-core/tests/integration/vector_stores/test_qdrant.py +++ b/packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py @@ -1,14 +1,35 @@ from unittest.mock import AsyncMock +import pytest +from chromadb import EphemeralClient from qdrant_client import AsyncQdrantClient +from ragbits.core.vector_stores.base import VectorStore +from ragbits.core.vector_stores.chroma import ChromaVectorStore +from ragbits.core.vector_stores.in_memory import InMemoryVectorStore from ragbits.core.vector_stores.qdrant import QdrantVectorStore from ragbits.document_search import DocumentSearch from ragbits.document_search.documents.document import DocumentMeta from ragbits.document_search.documents.sources import LocalFileSource -async def test_handling_document_ingestion_with_different_content_and_verifying_replacement() -> None: +@pytest.mark.parametrize( + "vector_store", + [ + InMemoryVectorStore(), + ChromaVectorStore( + client=EphemeralClient(), + index_name="test_index_name", + ), + QdrantVectorStore( + client=AsyncQdrantClient(":memory:"), + index_name="test_index_name", + ), + ], +) +async def test_handling_document_ingestion_with_different_content_and_verifying_replacement( + vector_store: VectorStore, +) -> None: document_1_content = "This is a test sentence and it should be in the vector store" document_2_content = "This is another test sentence and it should be removed from the vector store" document_2_new_content = "This is one more test sentence and it should be added to the vector store" @@ -18,10 +39,6 @@ async def test_handling_document_ingestion_with_different_content_and_verifying_ embedder = AsyncMock() embedder.embed_text.return_value = [[0.0], [0.0]] - vector_store = QdrantVectorStore( - client=AsyncQdrantClient(":memory:"), - index_name="test_index_name", - ) document_search = DocumentSearch( embedder=embedder, vector_store=vector_store,