feat: Qdrant - add support for BM42 (#864)

* Qdrant: add support for BM42
* add test for sparse configuration
deepset-ai · Jul 3, 2024 · 0fd154b · 0fd154b
1 parent fd0059e
commit 0fd154b
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 5 deletions.
diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]
+dependencies = ["haystack-ai>=2.0.1", "qdrant-client>=1.10.0"]
 
 [project.urls]
 Source = "https://github.com/deepset-ai/haystack-core-integrations"

diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py
@@ -111,6 +111,7 @@ def __init__(
         embedding_dim: int = 768,
         on_disk: bool = False,
         use_sparse_embeddings: bool = False,
+        sparse_idf: bool = False,
         similarity: str = "cosine",
         return_embedding: bool = False,
         progress_bar: bool = True,
@@ -168,6 +169,9 @@ def __init__(
             Whether to store the collection on disk.
         :param use_sparse_embeddings:
             If set to `True`, enables support for sparse embeddings.
+        :param sparse_idf:
+            If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
+            It is required to use techniques like BM42. It is ignored if `use_sparse_embeddings` is `False`.
         :param similarity:
             The similarity metric to use.
         :param return_embedding:
@@ -246,6 +250,7 @@ def __init__(
         self.recreate_index = recreate_index
         self.payload_fields_to_index = payload_fields_to_index
         self.use_sparse_embeddings = use_sparse_embeddings
+        self.sparse_idf = use_sparse_embeddings and sparse_idf
         self.embedding_dim = embedding_dim
         self.on_disk = on_disk
         self.similarity = similarity
@@ -280,6 +285,7 @@ def client(self):
                 self.recreate_index,
                 self.similarity,
                 self.use_sparse_embeddings,
+                self.sparse_idf,
                 self.on_disk,
                 self.payload_fields_to_index,
             )
@@ -347,7 +353,9 @@ def write_documents(
             if not isinstance(doc, Document):
                 msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
                 raise ValueError(msg)
-        self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings)
+        self._set_up_collection(
+            self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
+        )
 
         if len(documents) == 0:
             logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
@@ -732,6 +740,7 @@ def _set_up_collection(
         recreate_collection: bool,
         similarity: str,
         use_sparse_embeddings: bool,
+        sparse_idf: bool,
         on_disk: bool = False,
         payload_fields_to_index: Optional[List[dict]] = None,
     ):
@@ -747,6 +756,8 @@ def _set_up_collection(
             The similarity measure to use.
         :param use_sparse_embeddings:
             Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
         :param on_disk:
             Whether to store the collection on disk.
         :param payload_fields_to_index:
@@ -763,7 +774,9 @@ def _set_up_collection(
         if recreate_collection or not self.client.collection_exists(collection_name):
             # There is no need to verify the current configuration of that
             # collection. It might be just recreated again or does not exist yet.
-            self.recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings)
+            self.recreate_collection(
+                collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
+            )
             # Create Payload index if payload_fields_to_index is provided
             self._create_payload_index(collection_name, payload_fields_to_index)
             return
@@ -826,6 +839,7 @@ def recreate_collection(
         embedding_dim: int,
         on_disk: Optional[bool] = None,
         use_sparse_embeddings: Optional[bool] = None,
+        sparse_idf: bool = False,
     ):
         """
         Recreates the Qdrant collection with the specified parameters.
@@ -840,6 +854,8 @@ def recreate_collection(
             Whether to store the collection on disk.
         :param use_sparse_embeddings:
             Whether to use sparse embeddings.
+        :param sparse_idf:
+            Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
         """
         if on_disk is None:
             on_disk = self.on_disk
@@ -858,7 +874,8 @@ def recreate_collection(
                 SPARSE_VECTORS_NAME: rest.SparseVectorParams(
                     index=rest.SparseIndexParams(
                         on_disk=on_disk,
-                    )
+                    ),
+                    modifier=rest.Modifier.IDF if sparse_idf else None,
                 ),
             }
 

diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py
@@ -24,6 +24,7 @@ def test_to_dict():
             "on_disk": False,
             "force_disable_check_same_thread": False,
             "use_sparse_embeddings": False,
+            "sparse_idf": False,
             "similarity": "cosine",
             "return_embedding": False,
             "progress_bar": True,
@@ -60,6 +61,7 @@ def test_from_dict():
                 "on_disk": False,
                 "force_disable_check_same_thread": False,
                 "use_sparse_embeddings": True,
+                "sparse_idf": True,
                 "similarity": "cosine",
                 "return_embedding": False,
                 "progress_bar": True,
@@ -81,6 +83,7 @@ def test_from_dict():
             document_store.index == "test",
             document_store.force_disable_check_same_thread is False,
             document_store.use_sparse_embeddings is True,
+            document_store.sparse_idf is True,
             document_store.on_disk is False,
             document_store.similarity == "cosine",
             document_store.return_embedding is False,

diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py
@@ -12,7 +12,12 @@
     WriteDocumentsTest,
     _random_embeddings,
 )
-from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
+from haystack_integrations.document_stores.qdrant.document_store import (
+    SPARSE_VECTORS_NAME,
+    QdrantDocumentStore,
+    QdrantStoreError,
+)
+from qdrant_client.http import models as rest
 
 
 class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
@@ -49,6 +54,23 @@ def test_write_documents(self, document_store: QdrantDocumentStore):
         with pytest.raises(DuplicateDocumentError):
             document_store.write_documents(docs, DuplicatePolicy.FAIL)
 
+    def test_sparse_configuration(self):
+        document_store = QdrantDocumentStore(
+            ":memory:",
+            recreate_index=True,
+            use_sparse_embeddings=True,
+            sparse_idf=True,
+        )
+
+        client = document_store.client
+        sparse_config = client.get_collection("Document").config.params.sparse_vectors
+
+        assert SPARSE_VECTORS_NAME in sparse_config
+
+        # check that the `sparse_idf` parameter takes effect
+        assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier")
+        assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF
+
     def test_query_hybrid(self, generate_sparse_embedding):
         document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)
 

diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py
@@ -50,6 +50,7 @@ def test_to_dict(self):
                         "on_disk": False,
                         "force_disable_check_same_thread": False,
                         "use_sparse_embeddings": False,
+                        "sparse_idf": False,
                         "similarity": "cosine",
                         "return_embedding": False,
                         "progress_bar": True,
@@ -195,6 +196,7 @@ def test_to_dict(self):
                         "on_disk": False,
                         "force_disable_check_same_thread": False,
                         "use_sparse_embeddings": False,
+                        "sparse_idf": False,
                         "similarity": "cosine",
                         "return_embedding": False,
                         "progress_bar": True,
@@ -305,6 +307,7 @@ def test_to_dict(self):
                         "on_disk": False,
                         "force_disable_check_same_thread": False,
                         "use_sparse_embeddings": False,
+                        "sparse_idf": False,
                         "similarity": "cosine",
                         "return_embedding": False,
                         "progress_bar": True,