diff --git a/examples/document-search/chroma.py b/examples/document-search/chroma.py index 7becdff6..b641c360 100644 --- a/examples/document-search/chroma.py +++ b/examples/document-search/chroma.py @@ -100,7 +100,7 @@ async def main() -> None: print() print(f"Documents similar to: {query}") - print([element.get_key() for element in results]) + print([element.get_text_representation() for element in results]) if __name__ == "__main__": diff --git a/examples/document-search/chroma_otel.py b/examples/document-search/chroma_otel.py index d137bd42..64b5e198 100644 --- a/examples/document-search/chroma_otel.py +++ b/examples/document-search/chroma_otel.py @@ -130,7 +130,7 @@ async def main() -> None: print() print(f"Documents similar to: {query}") - print([element.get_key() for element in results]) + print([element.get_text_representation() for element in results]) if __name__ == "__main__": diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index f1c102b1..234ca441 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -12,8 +12,9 @@ class VectorStoreEntry(BaseModel): An object representing a vector database entry. """ - key: str + id: str vector: list[float] + content: str metadata: dict diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index f3a981f6..00a147d7 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -from hashlib import sha256 from typing import Literal import chromadb @@ -84,9 +83,8 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: Args: entries: The entries to store. 
""" - # TODO: Think about better id components for hashing and move hash computing to VectorStoreEntry - ids = [sha256(entry.key.encode("utf-8")).hexdigest() for entry in entries] - documents = [entry.key for entry in entries] + ids = [entry.id for entry in entries] + documents = [entry.content for entry in entries] embeddings = [entry.vector for entry in entries] metadatas = [entry.metadata for entry in entries] metadatas = ( @@ -132,12 +130,13 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None return [ VectorStoreEntry( - key=document, + id=id, + content=document, vector=list(embeddings), metadata=metadata, # type: ignore ) - for batch in zip(metadatas, embeddings, distances, documents, strict=False) - for metadata, embeddings, distance, document in zip(*batch, strict=False) + for batch in zip(ids, metadatas, embeddings, distances, documents, strict=False) + for id, metadata, embeddings, distance, document in zip(*batch, strict=False) if options.max_distance is None or distance <= options.max_distance ] @@ -182,9 +181,10 @@ async def list( return [ VectorStoreEntry( - key=document, + id=id, + content=document, vector=list(embedding), metadata=metadata, # type: ignore ) - for metadata, embedding, document in zip(metadatas, embeddings, documents, strict=False) + for id, metadata, embedding, document in zip(ids, metadatas, embeddings, documents, strict=False) ] diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py index 4f2b9454..d1a1c27e 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py @@ -36,7 +36,7 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: entries: The entries to store. 
""" for entry in entries: - self._storage[entry.key] = entry + self._storage[entry.id] = entry @traceable async def retrieve(self, vector: list[float], options: VectorStoreOptions | None = None) -> list[VectorStoreEntry]: diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py index 2f796848..b3d038ec 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py @@ -22,7 +22,8 @@ async def test_get_chroma_collection(mock_chromadb_store: ChromaVectorStore) -> async def test_store(mock_chromadb_store: ChromaVectorStore) -> None: data = [ VectorStoreEntry( - key="test_key", + id="test_key", + content="test content", vector=[0.1, 0.2, 0.3], metadata={ "content": "test content", @@ -39,7 +40,7 @@ async def test_store(mock_chromadb_store: ChromaVectorStore) -> None: mock_chromadb_store._client.get_or_create_collection().add.assert_called_once() # type: ignore mock_chromadb_store._client.get_or_create_collection().add.assert_called_with( # type: ignore - ids=["92488e1e3eeecdf99f3ed2ce59233efb4b4fb612d5655c0ce9ea52b5a502e655"], + ids=[data[0].id], embeddings=[[0.1, 0.2, 0.3]], metadatas=[ { @@ -47,7 +48,7 @@ async def test_store(mock_chromadb_store: ChromaVectorStore) -> None: ' {"path": "/test/path"}, "document_type": "test_type"}}', } ], - documents=["test_key"], + documents=["test content"], ) @@ -85,7 +86,7 @@ async def test_retrieve( ], "embeddings": [[[0.12, 0.25, 0.29], [0.13, 0.26, 0.30]]], "distances": [[0.1, 0.2]], - "documents": [["test_key_1", "test_key_2"]], + "documents": [["test content 1", "test content 2"]], "ids": [["test_id_1", "test_id_2"]], } @@ -96,6 +97,8 @@ async def test_retrieve( assert entry.metadata["content"] == result["content"] assert entry.metadata["document"]["title"] == result["title"] assert entry.vector == result["vector"] + assert entry.id == f"test_id_{results.index(result) 
+ 1}" + assert entry.content == result["content"] async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: @@ -112,7 +115,7 @@ async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: }, ], "embeddings": [[0.12, 0.25, 0.29], [0.13, 0.26, 0.30]], - "documents": ["test_key", "test_key_2"], + "documents": ["test content 1", "test content2"], "ids": ["test_id_1", "test_id_2"], } @@ -122,6 +125,10 @@ async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: assert entries[0].metadata["content"] == "test content" assert entries[0].metadata["document"]["title"] == "test title" assert entries[0].vector == [0.12, 0.25, 0.29] + assert entries[0].content == "test content 1" + assert entries[0].id == "test_id_1" assert entries[1].metadata["content"] == "test content 2" assert entries[1].metadata["document"]["title"] == "test title 2" assert entries[1].vector == [0.13, 0.26, 0.30] + assert entries[1].content == "test content2" + assert entries[1].id == "test_id_2" diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py index d9c23a57..d47e19a2 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_in_memory.py @@ -20,15 +20,6 @@ class AnimalElement(Element): type: str age: int - def get_key(self) -> str: - """ - Get the key of the element which will be used to generate the vector. - - Returns: - The key. - """ - return self.name - def get_text_representation(self) -> str: """ Get the text representation of the element. 
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/_main.py b/packages/ragbits-document-search/src/ragbits/document_search/_main.py index b0fbacef..75cd8390 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/_main.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/_main.py @@ -169,6 +169,6 @@ async def insert_elements(self, elements: list[Element]) -> None: Args: elements: The list of Elements to insert. """ - vectors = await self.embedder.embed_text([element.get_key() for element in elements]) + vectors = await self.embedder.embed_text([element.get_text_for_embedding() for element in elements]) entries = [element.to_vector_db_entry(vector) for element, vector in zip(elements, vectors, strict=False)] await self.vector_store.store(entries) diff --git a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py index a0a51ca3..21b080f2 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/documents/element.py @@ -1,7 +1,8 @@ +import uuid from abc import ABC, abstractmethod from typing import Any, ClassVar -from pydantic import BaseModel +from pydantic import BaseModel, computed_field from ragbits.core.vector_stores.base import VectorStoreEntry from ragbits.document_search.documents.document import DocumentMeta @@ -27,12 +28,32 @@ class Element(BaseModel, ABC): _elements_registry: ClassVar[dict[str, type["Element"]]] = {} - def get_key(self) -> str: + @computed_field # type: ignore[prop-decorator] + @property + def id(self) -> str: """ - Get the key of the element which will be used to generate the vector. + Get the ID of the element. The id is primarily used as a key in the vector store. 
+ The current representation is a UUID5 hash of various element metadata, including + its contents and location where it was sourced from. Returns: - The key. + The ID in the form of a UUID5 hash. + """ + id_components = [ + self.document_meta.id, + self.get_text_for_embedding(), + self.get_text_representation(), + str(self.location), + ] + + return str(uuid.uuid5(uuid.NAMESPACE_OID, ";".join(id_components))) + + def get_text_for_embedding(self) -> str: + """ + Get the text representation of the element for embedding. + + Returns: + The text representation for embedding. """ return self.get_text_representation() @@ -82,8 +103,9 @@ def to_vector_db_entry(self, vector: list[float]) -> VectorStoreEntry: The vector database entry """ return VectorStoreEntry( - key=self.get_key(), + id=self.id, vector=vector, + content=self.get_text_for_embedding(), metadata=self.model_dump(), ) diff --git a/packages/ragbits-document-search/tests/unit/test_elements.py b/packages/ragbits-document-search/tests/unit/test_elements.py index a41a69e4..7f98307c 100644 --- a/packages/ragbits-document-search/tests/unit/test_elements.py +++ b/packages/ragbits-document-search/tests/unit/test_elements.py @@ -8,15 +8,13 @@ class MyElement(Element): element_type: str = "custom_element" foo: str - def get_key(self) -> str: - return self.foo + self.foo - def get_text_representation(self) -> str: return self.foo + self.foo element = Element.from_vector_db_entry( db_entry=VectorStoreEntry( - key="key", + id="test id", + content="test content", vector=[0.1, 0.2], metadata={ "element_type": "custom_element", @@ -31,6 +29,7 @@ def get_text_representation(self) -> str: assert isinstance(element, MyElement) assert element.foo == "bar" - assert element.get_key() == "barbar" + assert element.get_text_for_embedding() == "barbar" + assert element.get_text_representation() == "barbar" assert element.document_meta.document_type == DocumentType.TXT assert element.document_meta.source.source_type == 
"local_file_source"