Skip to content

Commit

Permalink
feat: Qdrant - add support for BM42 (#864)
Browse files Browse the repository at this point in the history
* Qdrant: add support for BM42

* add test for sparse configuration
  • Loading branch information
anakin87 authored Jul 3, 2024
1 parent fd0059e commit 0fd154b
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 5 deletions.
2 changes: 1 addition & 1 deletion integrations/qdrant/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]
dependencies = ["haystack-ai>=2.0.1", "qdrant-client>=1.10.0"]

[project.urls]
Source = "https://github.com/deepset-ai/haystack-core-integrations"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def __init__(
embedding_dim: int = 768,
on_disk: bool = False,
use_sparse_embeddings: bool = False,
sparse_idf: bool = False,
similarity: str = "cosine",
return_embedding: bool = False,
progress_bar: bool = True,
Expand Down Expand Up @@ -168,6 +169,9 @@ def __init__(
Whether to store the collection on disk.
:param use_sparse_embeddings:
If set to `True`, enables support for sparse embeddings.
:param sparse_idf:
If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
It is required to use techniques like BM42. It is ignored if `use_sparse_embeddings` is `False`.
:param similarity:
The similarity metric to use.
:param return_embedding:
Expand Down Expand Up @@ -246,6 +250,7 @@ def __init__(
self.recreate_index = recreate_index
self.payload_fields_to_index = payload_fields_to_index
self.use_sparse_embeddings = use_sparse_embeddings
self.sparse_idf = use_sparse_embeddings and sparse_idf
self.embedding_dim = embedding_dim
self.on_disk = on_disk
self.similarity = similarity
Expand Down Expand Up @@ -280,6 +285,7 @@ def client(self):
self.recreate_index,
self.similarity,
self.use_sparse_embeddings,
self.sparse_idf,
self.on_disk,
self.payload_fields_to_index,
)
Expand Down Expand Up @@ -347,7 +353,9 @@ def write_documents(
if not isinstance(doc, Document):
msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}."
raise ValueError(msg)
self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings)
self._set_up_collection(
self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings, self.sparse_idf
)

if len(documents) == 0:
logger.warning("Calling QdrantDocumentStore.write_documents() with empty list")
Expand Down Expand Up @@ -732,6 +740,7 @@ def _set_up_collection(
recreate_collection: bool,
similarity: str,
use_sparse_embeddings: bool,
sparse_idf: bool,
on_disk: bool = False,
payload_fields_to_index: Optional[List[dict]] = None,
):
Expand All @@ -747,6 +756,8 @@ def _set_up_collection(
The similarity measure to use.
:param use_sparse_embeddings:
Whether to use sparse embeddings.
:param sparse_idf:
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
:param on_disk:
Whether to store the collection on disk.
:param payload_fields_to_index:
Expand All @@ -763,7 +774,9 @@ def _set_up_collection(
if recreate_collection or not self.client.collection_exists(collection_name):
# There is no need to verify the current configuration of that
# collection. It might be just recreated again or does not exist yet.
self.recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings)
self.recreate_collection(
collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings, sparse_idf
)
# Create Payload index if payload_fields_to_index is provided
self._create_payload_index(collection_name, payload_fields_to_index)
return
Expand Down Expand Up @@ -826,6 +839,7 @@ def recreate_collection(
embedding_dim: int,
on_disk: Optional[bool] = None,
use_sparse_embeddings: Optional[bool] = None,
sparse_idf: bool = False,
):
"""
Recreates the Qdrant collection with the specified parameters.
Expand All @@ -840,6 +854,8 @@ def recreate_collection(
Whether to store the collection on disk.
:param use_sparse_embeddings:
Whether to use sparse embeddings.
:param sparse_idf:
Whether to compute the Inverse Document Frequency (IDF) when using sparse embeddings. Required for BM42.
"""
if on_disk is None:
on_disk = self.on_disk
Expand All @@ -858,7 +874,8 @@ def recreate_collection(
SPARSE_VECTORS_NAME: rest.SparseVectorParams(
index=rest.SparseIndexParams(
on_disk=on_disk,
)
),
modifier=rest.Modifier.IDF if sparse_idf else None,
),
}

Expand Down
3 changes: 3 additions & 0 deletions integrations/qdrant/tests/test_dict_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def test_to_dict():
"on_disk": False,
"force_disable_check_same_thread": False,
"use_sparse_embeddings": False,
"sparse_idf": False,
"similarity": "cosine",
"return_embedding": False,
"progress_bar": True,
Expand Down Expand Up @@ -60,6 +61,7 @@ def test_from_dict():
"on_disk": False,
"force_disable_check_same_thread": False,
"use_sparse_embeddings": True,
"sparse_idf": True,
"similarity": "cosine",
"return_embedding": False,
"progress_bar": True,
Expand All @@ -81,6 +83,7 @@ def test_from_dict():
document_store.index == "test",
document_store.force_disable_check_same_thread is False,
document_store.use_sparse_embeddings is True,
document_store.sparse_idf is True,
document_store.on_disk is False,
document_store.similarity == "cosine",
document_store.return_embedding is False,
Expand Down
24 changes: 23 additions & 1 deletion integrations/qdrant/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
WriteDocumentsTest,
_random_embeddings,
)
from haystack_integrations.document_stores.qdrant.document_store import QdrantDocumentStore, QdrantStoreError
from haystack_integrations.document_stores.qdrant.document_store import (
SPARSE_VECTORS_NAME,
QdrantDocumentStore,
QdrantStoreError,
)
from qdrant_client.http import models as rest


class TestQdrantDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest):
Expand Down Expand Up @@ -49,6 +54,23 @@ def test_write_documents(self, document_store: QdrantDocumentStore):
with pytest.raises(DuplicateDocumentError):
document_store.write_documents(docs, DuplicatePolicy.FAIL)

def test_sparse_configuration(self):
    """
    Check that a store created with `sparse_idf=True` exposes the IDF modifier
    in the sparse vector configuration of the "Document" collection.
    """
    store = QdrantDocumentStore(
        ":memory:",
        recreate_index=True,
        use_sparse_embeddings=True,
        sparse_idf=True,
    )

    collection_info = store.client.get_collection("Document")
    sparse_vectors = collection_info.config.params.sparse_vectors

    # the named sparse vector must be part of the collection configuration
    assert SPARSE_VECTORS_NAME in sparse_vectors

    # check that the `sparse_idf` parameter takes effect
    sparse_params = sparse_vectors[SPARSE_VECTORS_NAME]
    assert hasattr(sparse_params, "modifier")
    assert sparse_params.modifier == rest.Modifier.IDF

def test_query_hybrid(self, generate_sparse_embedding):
document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True)

Expand Down
3 changes: 3 additions & 0 deletions integrations/qdrant/tests/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_to_dict(self):
"on_disk": False,
"force_disable_check_same_thread": False,
"use_sparse_embeddings": False,
"sparse_idf": False,
"similarity": "cosine",
"return_embedding": False,
"progress_bar": True,
Expand Down Expand Up @@ -195,6 +196,7 @@ def test_to_dict(self):
"on_disk": False,
"force_disable_check_same_thread": False,
"use_sparse_embeddings": False,
"sparse_idf": False,
"similarity": "cosine",
"return_embedding": False,
"progress_bar": True,
Expand Down Expand Up @@ -305,6 +307,7 @@ def test_to_dict(self):
"on_disk": False,
"force_disable_check_same_thread": False,
"use_sparse_embeddings": False,
"sparse_idf": False,
"similarity": "cosine",
"return_embedding": False,
"progress_bar": True,
Expand Down

0 comments on commit 0fd154b

Please sign in to comment.