diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml index 7e86bf1f6..3544ae7d4 100644 --- a/integrations/qdrant/pyproject.toml +++ b/integrations/qdrant/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.0.1", "qdrant-client"] +dependencies = ["haystack-ai>=2.0.1", "qdrant-client>=1.10.0"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index f2724a969..d55cbd71c 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -111,6 +111,7 @@ def __init__( embedding_dim: int = 768, on_disk: bool = False, use_sparse_embeddings: bool = False, + sparse_idf: bool = False, similarity: str = "cosine", return_embedding: bool = False, progress_bar: bool = True, @@ -168,6 +169,9 @@ def __init__( Whether to store the collection on disk. :param use_sparse_embedding: If set to `True`, enables support for sparse embeddings. + :param sparse_idf: + If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings. + It is required to use techniques like BM42. It is ignored if `use_sparse_embeddings` is `False`. :param similarity: The similarity metric to use. :param return_embedding: @@ -246,6 +250,7 @@ def __init__( self.recreate_index = recreate_index self.payload_fields_to_index = payload_fields_to_index self.use_sparse_embeddings = use_sparse_embeddings + self.sparse_idf = use_sparse_embeddings and sparse_idf self.embedding_dim = embedding_dim self.on_disk = on_disk self.similarity = similarity @@ -280,6 +285,7 @@ def client(self): self.recreate_index, self.similarity, self.use_sparse_embeddings, + self.sparse_idf, self.on_disk, self.payload_fields_to_index, ) @@ -347,7 +353,9 @@ def write_documents( if not isinstance(doc, Document): msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}." raise ValueError(msg) - self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings) + self._set_up_collection( + self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf + ) if len(documents) == 0: logger.warning("Calling QdrantDocumentStore.write_documents() with empty list") @@ -732,6 +740,7 @@ def _set_up_collection( recreate_collection: bool, similarity: str, use_sparse_embeddings: bool, + sparse_idf: bool, on_disk: bool = False, payload_fields_to_index: Optional[List[dict]] = None, ): @@ -747,6 +756,8 @@ def _set_up_collection( The similarity measure to use. :param use_sparse_embeddings: Whether to use sparse embeddings. + :param sparse_idf: + Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42. :param on_disk: Whether to store the collection on disk. :param payload_fields_to_index: @@ -763,7 +774,9 @@ def _set_up_collection( if recreate_collection or not self.client.collection_exists(collection_name): # There is no need to verify the current configuration of that # collection. It might be just recreated again or does not exist yet. - self.recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings) + self.recreate_collection( + collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf + ) # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return @@ -826,6 +839,7 @@ def recreate_collection( embedding_dim: int, on_disk: Optional[bool] = None, use_sparse_embeddings: Optional[bool] = None, + sparse_idf: bool = False, ): """ Recreates the Qdrant collection with the specified parameters. @@ -840,6 +854,8 @@ def recreate_collection( Whether to store the collection on disk. :param use_sparse_embeddings: Whether to use sparse embeddings. + :param sparse_idf: + Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42. """ if on_disk is None: on_disk = self.on_disk @@ -858,7 +874,8 @@ def recreate_collection( SPARSE_VECTORS_NAME: rest.SparseVectorParams( index=rest.SparseIndexParams( on_disk=on_disk, - ) + ), + modifier=rest.Modifier.IDF if sparse_idf else None, ), } diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index 9fc8779f7..3871dbff0 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -24,6 +24,7 @@ def test_to_dict(): "on_disk": False, "force_disable_check_same_thread": False, "use_sparse_embeddings": False, + "sparse_idf": False, "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -60,6 +61,7 @@ def test_from_dict(): "on_disk": False, "force_disable_check_same_thread": False, "use_sparse_embeddings": True, + "sparse_idf": True, "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -81,6 +83,7 @@ def test_from_dict(): document_store.index == "test", document_store.force_disable_check_same_thread is False, document_store.use_sparse_embeddings is True, + document_store.sparse_idf is True, document_store.on_disk is False, document_store.similarity == "cosine", document_store.return_embedding is False, diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py index f18b2c453..c388a10cf 100644 --- a/integrations/qdrant/tests/test_document_store.py +++ b/integrations/qdrant/tests/test_document_store.py @@ -12,7 +12,12 @@ WriteDocumentsTest, _random_embeddings, ) -from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError +from haystack_integrations.document_stores.qdrant.document_store import ( + SPARSE_VECTORS_NAME, + QdrantDocumentStore, + QdrantStoreError, +) +from qdrant_client.http import models as rest class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): @@ -49,6 +54,23 @@ def test_write_documents(self, document_store: QdrantDocumentStore): with pytest.raises(DuplicateDocumentError): document_store.write_documents(docs, DuplicatePolicy.FAIL) + def test_sparse_configuration(self): + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + use_sparse_embeddings=True, + sparse_idf=True, + ) + + client = document_store.client + sparse_config = client.get_collection("Document").config.params.sparse_vectors + + assert SPARSE_VECTORS_NAME in sparse_config + + # check that the `sparse_idf` parameter takes effect + assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier") + assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF + def test_query_hybrid(self, generate_sparse_embedding): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 0fa4e2f66..c011a2261 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -50,6 +50,7 @@ def test_to_dict(self): "on_disk": False, "force_disable_check_same_thread": False, "use_sparse_embeddings": False, + "sparse_idf": False, "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -195,6 +196,7 @@ def test_to_dict(self): "on_disk": False, "force_disable_check_same_thread": False, "use_sparse_embeddings": False, + "sparse_idf": False, "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -305,6 +307,7 @@ def test_to_dict(self): "on_disk": False, "force_disable_check_same_thread": False, "use_sparse_embeddings": False, + "sparse_idf": False, "similarity": "cosine", "return_embedding": False, "progress_bar": True,