Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): update document in vector store #210

Merged
merged 22 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
22d775d
fix: qdrant returns empty list instead of raising error when listing …
mackurzawa Nov 27, 2024
885a77d
fix: add document_meta.source.id to VectorStoreEntry metadata
mackurzawa Nov 27, 2024
f07b293
feat: add remove methods for vector stores
mackurzawa Nov 27, 2024
0080c8f
test: add unit tests for remove method in vector stores
mackurzawa Nov 27, 2024
2857180
feat: add removing entries from vector store when ingesting documents…
mackurzawa Nov 27, 2024
44f3ea7
test: add integration tests for vector stores, regarding ingesting do…
mackurzawa Nov 27, 2024
42c6afd
fix: lint
mackurzawa Nov 28, 2024
af80819
fix: change checking qdrant collection existance method and update qd…
mackurzawa Nov 28, 2024
c8870ab
fix: mock embedder in integration tests
mackurzawa Nov 28, 2024
c042a33
chore: rename function from remove_entries_with_same_id to remove_ent…
mackurzawa Nov 28, 2024
cb2d051
refactor: optimize integration tests for better readability and effic…
mackurzawa Nov 28, 2024
feccfc6
chore: rename integration tests from test_update_document to test_han…
mackurzawa Nov 28, 2024
eec2bfc
Merge branch 'main' into feat/update-document-in-vector-store
mackurzawa Nov 28, 2024
1e97250
chore: make remove_entries_with_same_sources private
mackurzawa Nov 29, 2024
7eea03e
merge main and resolve conflicts
mackurzawa Nov 29, 2024
96e9948
fix: handle limit=None in Qdrant
mackurzawa Nov 29, 2024
b48a1a2
chore: add TODO for passing 'where' argument to list method in __remo…
mackurzawa Nov 29, 2024
3d8a32f
fix: change way for obtaining metadata from entry
mackurzawa Nov 29, 2024
887f946
docs: update docstring for qdrant remove method
mackurzawa Nov 29, 2024
2ad8a4a
Merge branch 'main' into feat/update-document-in-vector-store
mackurzawa Nov 29, 2024
a09bf2c
chore: update function name
mackurzawa Nov 29, 2024
e84f110
refactor: combine integration tests to one parametrized test
mackurzawa Nov 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions packages/ragbits-core/src/ragbits/core/vector_stores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None
The entries.
"""

@abstractmethod
async def remove(self, ids: list[str]) -> None:
    """
    Delete the entries identified by the given IDs from the vector store.

    Args:
        ids: IDs of the entries that should be deleted.
    """

@abstractmethod
async def list(
self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0
Expand Down
10 changes: 10 additions & 0 deletions packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,16 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None
if options.max_distance is None or distance <= options.max_distance
]

@traceable
async def remove(self, ids: list[str]) -> None:
    """
    Delete the entries with the given IDs from the underlying collection.

    Args:
        ids: IDs of the entries to delete.
    """
    collection = self._collection
    collection.delete(ids=ids)

@traceable
async def list(
self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0
Expand Down
11 changes: 11 additions & 0 deletions packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,17 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None
if options.max_distance is None or distance <= options.max_distance
]

@traceable
async def remove(self, ids: list[str]) -> None:
    """
    Remove entries from the vector store.

    IDs that are not present in the store are skipped, so a single unknown ID
    cannot abort the removal of the remaining ones.

    Args:
        ids: The list of entries' IDs to remove.
    """
    for entry_id in ids:
        # pop() with a default tolerates missing keys, unlike `del`.
        self._storage.pop(entry_id, None)

@traceable
async def list(
self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0
Expand Down
29 changes: 27 additions & 2 deletions packages/ragbits-core/src/ragbits/core/vector_stores/qdrant.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import typing

import qdrant_client
from qdrant_client import AsyncQdrantClient
from qdrant_client import AsyncQdrantClient, models
from qdrant_client.models import Distance, Filter, VectorParams

from ragbits.core.audit import traceable
Expand Down Expand Up @@ -146,6 +147,24 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None
for id, document, vector, metadata in zip(ids, documents, vectors, metadatas, strict=True)
]

@traceable
async def remove(self, ids: list[str]) -> None:
    """
    Delete the points with the given IDs from the collection.

    Args:
        ids: IDs of the entries to delete.

    Raises:
        ValueError: If collection named `self._index_name` is not present in the vector store.
    """
    # The cast only widens the static type to what `PointIdsList` accepts; the list is passed as is.
    point_ids = typing.cast(list[int | str], ids)
    selector = models.PointIdsList(points=point_ids)
    await self._client.delete(collection_name=self._index_name, points_selector=selector)

@traceable
async def list( # type: ignore
self,
Expand All @@ -168,10 +187,16 @@ async def list( # type: ignore
Raises:
MetadataNotFoundError: If the metadata is not found.
"""
collection_exists = await self._client.collection_exists(collection_name=self._index_name)
ludwiktrammer marked this conversation as resolved.
Show resolved Hide resolved
if not collection_exists:
return []

limit = limit or (await self._client.count(collection_name=self._index_name)).count

results = await self._client.query_points(
collection_name=self._index_name,
query_filter=where,
limit=limit or 10,
limit=limit,
offset=offset,
with_payload=True,
with_vectors=True,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from unittest.mock import AsyncMock

import pytest
from chromadb import EphemeralClient
from qdrant_client import AsyncQdrantClient

from ragbits.core.vector_stores.base import VectorStore
from ragbits.core.vector_stores.chroma import ChromaVectorStore
from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
from ragbits.core.vector_stores.qdrant import QdrantVectorStore
from ragbits.document_search import DocumentSearch
from ragbits.document_search.documents.document import DocumentMeta
from ragbits.document_search.documents.sources import LocalFileSource


@pytest.mark.parametrize(
"vector_store",
[
InMemoryVectorStore(),
ChromaVectorStore(
client=EphemeralClient(),
index_name="test_index_name",
),
QdrantVectorStore(
client=AsyncQdrantClient(":memory:"),
index_name="test_index_name",
),
],
)
async def test_handling_document_ingestion_with_different_content_and_verifying_replacement(
    vector_store: VectorStore,
) -> None:
    """
    Check that re-ingesting a document whose content changed replaces its old entries.

    Two documents are ingested, the second one is rewritten on disk and ingested again.
    The store must then contain the first document and the new content of the second one,
    but no longer the old content of the second one.
    """
    document_1_content = "This is a test sentence and it should be in the vector store"
    document_2_content = "This is another test sentence and it should be removed from the vector store"
    document_2_new_content = "This is one more test sentence and it should be added to the vector store"

    document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
    document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)

    embedder = AsyncMock()
    embedder.embed_text.return_value = [[0.0], [0.0]]
    document_search = DocumentSearch(
        embedder=embedder,
        vector_store=vector_store,
    )
    await document_search.ingest([document_1, document_2])

    # Fail here, rather than on a misleading content assertion below, if the
    # document is not backed by a local file that can be rewritten.
    assert isinstance(document_2.source, LocalFileSource)
    with open(document_2.source.path, "w", encoding="utf-8") as file:
        file.write(document_2_new_content)

    await document_search.ingest([document_2])

    document_contents = {entry.key for entry in await vector_store.list()}

    assert document_1_content in document_contents
    assert document_2_new_content in document_contents
    assert document_2_content not in document_contents
9 changes: 9 additions & 0 deletions packages/ragbits-core/tests/unit/vector_stores/test_chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ async def test_retrieve(
assert entry.key == result["content"]


async def test_remove(mock_chromadb_store: ChromaVectorStore) -> None:
    """Check that `remove` forwards the given IDs to the collection's `delete` exactly once."""
    ids_to_remove = ["1c7d6b27-4ef1-537c-ad7c-676edb8bc8a8"]

    await mock_chromadb_store.remove(ids_to_remove)

    collection = mock_chromadb_store._client.get_or_create_collection()  # type: ignore
    collection.delete.assert_called_once_with(ids=ids_to_remove)
pawel-chmielak-deepsense marked this conversation as resolved.
Show resolved Hide resolved


async def test_list(mock_chromadb_store: ChromaVectorStore) -> None:
mock_chromadb_store._collection.get.return_value = { # type: ignore
"metadatas": [
Expand Down
10 changes: 10 additions & 0 deletions packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ async def test_retrieve(store: InMemoryVectorStore, k: int, max_distance: float
assert entry.metadata["name"] == result


async def test_remove(store: InMemoryVectorStore) -> None:
    """Check that removing one entry by ID deletes exactly that entry and keeps the others."""
    entries_before = await store.list()
    removed_id = entries_before[0].id

    await store.remove([removed_id])

    remaining_ids = [entry.id for entry in await store.list()]
    assert len(remaining_ids) == len(entries_before) - 1
    # A count check alone would also pass if the wrong entry had been deleted.
    assert removed_id not in remaining_ids


async def test_list_all(store: InMemoryVectorStore) -> None:
results = await store.list()

Expand Down
16 changes: 16 additions & 0 deletions packages/ragbits-core/tests/unit/vector_stores/test_qdrant.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import typing
from unittest.mock import AsyncMock

import pytest
Expand Down Expand Up @@ -96,7 +97,22 @@ async def test_retrieve(mock_qdrant_store: QdrantVectorStore) -> None:
assert entry.vector == result["vector"]


async def test_remove(mock_qdrant_store: QdrantVectorStore) -> None:
    """Check that `remove` issues a single `delete` call carrying the IDs as a `PointIdsList`."""
    ids_to_remove = ["1c7d6b27-4ef1-537c-ad7c-676edb8bc8a8"]
    expected_selector = models.PointIdsList(
        points=typing.cast(list[int | str], ids_to_remove),
    )

    await mock_qdrant_store.remove(ids_to_remove)

    mock_qdrant_store._client.delete.assert_called_once_with(  # type: ignore
        collection_name="test_collection",
        points_selector=expected_selector,
    )


async def test_list(mock_qdrant_store: QdrantVectorStore) -> None:
mock_qdrant_store._client.collection_exists.return_value = True # type: ignore
mock_qdrant_store._client.query_points.return_value = models.QueryResponse( # type: ignore
points=[
models.ScoredPoint(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,27 @@ async def ingest(
elements = await self.processing_strategy.process_documents(
documents, self.document_processor_router, document_processor
)
await self._remove_entries_with_same_sources(elements)
await self.insert_elements(elements)

async def _remove_entries_with_same_sources(self, elements: list[Element]) -> None:
    """
    Remove entries from the vector store whose source id is present in the elements' metadata.

    Args:
        elements: List of elements whose source ids will be checked and removed from the vector store if present.
    """
    unique_source_ids = {element.document_meta.source.id for element in elements}
    if not unique_source_ids:
        # Nothing can match, so skip listing the whole vector store.
        return

    # TODO: Pass 'where' argument to the list method to filter results and optimize search
    entries = await self.vector_store.list()
    ids_to_delete = [
        entry.id
        for entry in entries
        if entry.metadata.get("document_meta", {}).get("source", {}).get("id") in unique_source_ids
    ]

    if ids_to_delete:
        await self.vector_store.remove(ids_to_delete)

async def insert_elements(self, elements: list[Element]) -> None:
"""
Insert Elements into the vector store.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def to_vector_db_entry(self, vector: list[float], embedding_type: EmbeddingType)
vector_store_entry_id = str(uuid.uuid5(uuid.NAMESPACE_OID, ";".join(id_components)))
metadata = self.model_dump(exclude={"id", "key"})
metadata["embedding_type"] = str(embedding_type)
metadata["document_meta"]["source"]["id"] = self.document_meta.source.id
return VectorStoreEntry(id=vector_store_entry_id, key=str(self.key), vector=vector, metadata=metadata)


Expand Down
Loading