From 0f650ee26e40700d1b13651e69ddf99e5d902e07 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 13 Mar 2024 13:35:38 +0100 Subject: [PATCH 01/41] feat(Qdrant): start to working on sparse vector integration --- .../components/retrievers/qdrant/__init__.py | 4 +- .../components/retrievers/qdrant/retriever.py | 122 +++++++++++++++++- .../document_stores/qdrant/converters.py | 9 +- .../document_stores/qdrant/document_store.py | 52 ++++++++ 4 files changed, 182 insertions(+), 5 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index 41b59e42d..58be4211a 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever -__all__ = ("QdrantEmbeddingRetriever",) +__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index cd53ccd7b..826847c59 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from haystack import Document, component, default_from_dict, default_to_dict from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -7,7 +7,7 @@ @component class QdrantEmbeddingRetriever: """ - A component for retrieving documents from an QdrantDocumentStore. + A component for retrieving documents from an QdrantDocumentStore using dense vectors. Usage example: ```python @@ -120,3 +120,121 @@ def run( ) return {"documents": docs} + + +@component +class QdrantSparseRetriever: + """ + A component for retrieving documents from an QdrantDocumentStore using sparse vectors. + + Usage example: + ```python + from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever + from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_sparse_embedding=True, + wait_result_from_api=True, + ) + retriever = QdrantSparseRetriever(document_store=document_store) + + # using a fake sparse vector to keep the example simple + retriever.run(query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) + ``` + """ + + def __init__( + self, + document_store: QdrantDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 + ): + """ + Create a QdrantSparseRetriever component. + + :param document_store: An instance of QdrantDocumentStore. + :param filters: A dictionary with filters to narrow down the search space. Default is None. + :param top_k: The maximum number of documents to retrieve. Default is 10. + :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. + :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. + + :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. + """ + + if not isinstance(document_store, QdrantDocumentStore): + msg = "document_store must be an instance of QdrantDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters + self._top_k = top_k + self._scale_score = scale_score + self._return_embedding = return_embedding + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + d = default_to_dict( + self, + document_store=self._document_store, + filters=self._filters, + top_k=self._top_k, + scale_score=self._scale_score, + return_embedding=self._return_embedding, + ) + d["init_parameters"]["document_store"] = self._document_store.to_dict() + + return d + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) + data["init_parameters"]["document_store"] = document_store + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run( + self, + query_sparse_embedding: Dict[str, List[Union[int, float]]], + filters: Optional[Dict[str, Any]] = None, + top_k: Optional[int] = None, + scale_score: Optional[bool] = None, + return_embedding: Optional[bool] = None, + ): + """ + Run the Embedding Retriever on the given input data. + + :param query_sparse_embedding: Sparse Embedding of the query. + :param filters: A dictionary with filters to narrow down the search space. + :param top_k: The maximum number of documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents or not. + :param return_embedding: Whether to return the embedding of the retrieved Documents. + :returns: + The retrieved documents. + + """ + docs = self._document_store.query_by_sparse( + query_sparse_embedding=query_sparse_embedding, + filters=filters or self._filters, + top_k=top_k or self._top_k, + scale_score=scale_score or self._scale_score, + return_embedding=return_embedding or self._return_embedding, + ) + + return {"documents": docs} diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 3fb6dabd6..a741fa8f4 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -19,6 +19,8 @@ def documents_to_batch( points = [] for document in documents: payload = document.to_dict(flatten=False) + # TODO: vector should be built not only from embedding_field but also from the field containing sparse embeddings + # TODO: Because with sparse vectors, the vector is now a dict vector = payload.pop(embedding_field) or {} _id = self.convert_id(payload.get("id")) @@ -51,6 +53,11 @@ def __init__(self, content_field: str, name_field: str, embedding_field: str): def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} - payload["embedding"] = point.vector if hasattr(point, "vector") else None + # TODO: rework the converters part because now it's a mess with the new sparse embedding feature + # TODO: With dense+sparse embedding, vector is now a dict ? + # TODO: Unnamed dense vector are accessed with "" key ? + payload["embedding"] = point.vector[""] if hasattr(point, "vector") else None payload["score"] = point.score if hasattr(point, "score") else None + # TODO: Because haystack document don't have sparse embedding field (only dense) in their dataclass, put it in meta ? + payload["meta"]["sparse-embedding"] = point.vector["text-sparse"] if hasattr(point, "vector") else None return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index dc22673fa..bf8f444f5 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -298,6 +298,50 @@ def get_documents_by_id( documents.append(self.qdrant_to_haystack.point_to_document(record)) return documents + def query_by_sparse( + self, + query_sparse_embedding: Dict[str, List[Union[int, float]]], + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 + ) -> List[Document]: + qdrant_filters = self.qdrant_filter_converter.convert(filters) + + # TODO: we make the assumption that the sparse query is a dict with indices + # TODO: and values keys and list of int/float as value. Is it ok ? + # TODO: See FastEmbed https://github.com/qdrant/fastembed/blob/4cb8be2fb15c4f5ee2caa2629233121cfe389783/fastembed/sparse/sparse_embedding_base.py#L10 + query_indices = query_sparse_embedding["indices"] + query_values = query_sparse_embedding["values"] + if len(query_indices) != len(query_values): + error_message = "The indices and values of the sparse embedding query must have the same length." + raise ValueError(error_message) + + points = self.client.search( + collection_name=self.index, + query_vector=rest.NamedSparseVector( + name="text-sparse", + vector=rest.SparseVector( + indices=query_indices, + values=query_values, + ), + ), + query_filter=qdrant_filters, + limit=top_k, + with_vectors=return_embedding, + ) + + results = [self.qdrant_to_haystack.point_to_document(point) for point in points] + if scale_score: + for document in results: + score = document.score + if self.similarity == "cosine": + score = (score + 1) / 2 + else: + score = float(1 / (1 + np.exp(-score / 100))) + document.score = score + return results + def query_by_embedding( self, query_embedding: List[float], @@ -415,6 +459,14 @@ def _recreate_collection(self, collection_name: str, distance, embedding_dim: in on_disk=on_disk, distance=distance, ), + # TODO: we use named sparse vector, maybe we should named for dense also ? + sparse_vectors_config={ + "text-sparse": rest.SparseVectorParams( + index=rest.SparseIndexParams( + on_disk=on_disk, + ) + ) + }, shard_number=self.shard_number, replication_factor=self.replication_factor, write_consistency_factor=self.write_consistency_factor, From 6025e504173d4f339f1f3b66de45dbf598ea6c10 Mon Sep 17 00:00:00 2001 From: Corentin Date: Wed, 13 Mar 2024 23:59:43 +0100 Subject: [PATCH 02/41] Progress towards Sparse vector support with Fastembed --- .../components/retrievers/qdrant/retriever.py | 128 +++++++++++++++++- .../document_stores/qdrant/converters.py | 22 +-- .../document_stores/qdrant/document_store.py | 81 +++++++++-- 3 files changed, 209 insertions(+), 22 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 826847c59..203e20013 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -211,14 +211,14 @@ def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": @component.output_types(documents=List[Document]) def run( self, - query_sparse_embedding: Dict[str, List[Union[int, float]]], + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None, scale_score: Optional[bool] = None, return_embedding: Optional[bool] = None, ): """ - Run the Embedding Retriever on the given input data. + Run the Sparse Embedding Retriever on the given input data. :param query_sparse_embedding: Sparse Embedding of the query. :param filters: A dictionary with filters to narrow down the search space. @@ -238,3 +238,127 @@ def run( ) return {"documents": docs} + +@component +class QdrantHybridRetriever: + """ + A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). + + Usage example: + ```python + from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever + from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_sparse_embedding=True, + wait_result_from_api=True, + ) + retriever = QdrantHybridRetriever(document_store=document_store) + + # using a fake sparse vector to keep the example simple + retriever.run(query_embedding=[0.1]*768, query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) + ``` + """ + + def __init__( + self, + document_store: QdrantDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k_dense: int = 10, + top_k_sparse: int = 10, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 + ): + """ + Create a QdrantSparseRetriever component. + + :param document_store: An instance of QdrantDocumentStore. + :param filters: A dictionary with filters to narrow down the search space. Default is None. + :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. + :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. + :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. + :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. + + :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. + """ + + if not isinstance(document_store, QdrantDocumentStore): + msg = "document_store must be an instance of QdrantDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters + self._top_k_dense = top_k_sparse + self._top_k_sparse = top_k_sparse + self._scale_score = scale_score + self._return_embedding = return_embedding + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + d = default_to_dict( + self, + document_store=self._document_store, + filters=self._filters, + top_k_dense=self._top_k_dense, + top_k_sparse=self._top_k_sparse + scale_score=self._scale_score, + return_embedding=self._return_embedding, + ) + d["init_parameters"]["document_store"] = self._document_store.to_dict() + + return d + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) + data["init_parameters"]["document_store"] = document_store + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run( + self, + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: Optional[int] = None, + scale_score: Optional[bool] = None, + return_embedding: Optional[bool] = None, + ): + """ + Run the Sparse Embedding Retriever on the given input data. + + :param query_sparse_embedding: Sparse Embedding of the query. + :param filters: A dictionary with filters to narrow down the search space. + :param top_k: The maximum number of documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents or not. + :param return_embedding: Whether to return the embedding of the retrieved Documents. + :returns: + The retrieved documents. + + """ + docs = self._document_store.query_hybrid( + query_sparse_embedding=query_sparse_embedding, + query_embedding=query_embedding + filters=filters or self._filters, + top_k_dense=top_k_dense or self._top_k_dense, + top_k_sparse=top_k_sparse or self._top_k_sparse, + scale_score=scale_score or self._scale_score, + return_embedding=return_embedding or self._return_embedding, + ) + + return {"documents": docs} diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index a741fa8f4..7ecae7b4a 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -19,9 +19,10 @@ def documents_to_batch( points = [] for document in documents: payload = document.to_dict(flatten=False) - # TODO: vector should be built not only from embedding_field but also from the field containing sparse embeddings - # TODO: Because with sparse vectors, the vector is now a dict - vector = payload.pop(embedding_field) or {} + dense_vector = payload.pop(embedding_field) or {} + # TODO: Adapt to Haystack Modification of the Document Dataclass + sparse_vector = payload["meta"].pop("_sparse_vector") or {} + vector = {"text-dense": dense_vector, "text-sparse": sparse_vector} _id = self.convert_id(payload.get("id")) point = rest.PointStruct( @@ -53,11 +54,14 @@ def __init__(self, content_field: str, name_field: str, embedding_field: str): def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} - # TODO: rework the converters part because now it's a mess with the new sparse embedding feature - # TODO: With dense+sparse embedding, vector is now a dict ? - # TODO: Unnamed dense vector are accessed with "" key ? - payload["embedding"] = point.vector[""] if hasattr(point, "vector") else None + if hasattr(point, "vector") and "text-dense" in point.vector: + payload["embedding"] = point.vector["text-dense"] + else: + payload["embedding"] = None payload["score"] = point.score if hasattr(point, "score") else None - # TODO: Because haystack document don't have sparse embedding field (only dense) in their dataclass, put it in meta ? - payload["meta"]["sparse-embedding"] = point.vector["text-sparse"] if hasattr(point, "vector") else None + # TODO: Adapt to Haystack Modification of the Document Dataclass + if hasattr(point, "vector") and "text-dense" in point.vector: + payload["meta"]["_sparse_vector"] = point.vector["text-sparse"] + else: + payload["meta"]["_sparse_vector"] = None return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index bf8f444f5..dd5c4ce2b 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -298,9 +298,66 @@ def get_documents_by_id( documents.append(self.qdrant_to_haystack.point_to_document(record)) return documents + def query_hybrid( + self, + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k_dense: int = 10, + top_k_sparse: int = 10, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 + ) -> List[Document]: + qdrant_filters = self.qdrant_filter_converter.convert(filters) + + query_indices = query_sparse_embedding["indices"] + query_values = query_sparse_embedding["values"] + if len(query_indices) != len(query_values): + error_message = "The indices and values of the sparse embedding query must have the same length." + raise ValueError(error_message) + + points = self.client.search_batch( + collection_name=self.index, + requests=[ + rest.SearchRequest( + vector=rest.NamedVector( + name="text-dense", + vector=query_dense_vector, + ), + query_filter=qdrant_filters + limit=top_k_dense, + with_vectors=return_embedding + ), + rest.SearchRequest( + vector=rest.NamedSparseVector( + name="text-sparse", + vector=rest.SparseVector( + indices=query_indices, + values=query_values, + ), + ), + query_filter=qdrant_filters + limit=top_k_sparse, + with_vectors=return_embedding + ), + ], + ) + + results = [self.qdrant_to_haystack.point_to_document(point) for point in points] + # TODO: Check Scaling method + if scale_score: + for document in results: + score = document.score + if self.similarity == "cosine": + score = (score + 1) / 2 + else: + score = float(1 / (1 + np.exp(-score / 100))) + document.score = score + return results + def query_by_sparse( self, - query_sparse_embedding: Dict[str, List[Union[int, float]]], + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = True, # noqa: FBT001, FBT002 @@ -308,9 +365,6 @@ def query_by_sparse( ) -> List[Document]: qdrant_filters = self.qdrant_filter_converter.convert(filters) - # TODO: we make the assumption that the sparse query is a dict with indices - # TODO: and values keys and list of int/float as value. Is it ok ? - # TODO: See FastEmbed https://github.com/qdrant/fastembed/blob/4cb8be2fb15c4f5ee2caa2629233121cfe389783/fastembed/sparse/sparse_embedding_base.py#L10 query_indices = query_sparse_embedding["indices"] query_values = query_sparse_embedding["values"] if len(query_indices) != len(query_values): @@ -332,6 +386,7 @@ def query_by_sparse( ) results = [self.qdrant_to_haystack.point_to_document(point) for point in points] + # TODO: Check Scaling method if scale_score: for document in results: score = document.score @@ -354,7 +409,10 @@ def query_by_embedding( points = self.client.search( collection_name=self.index, - query_vector=query_embedding, + query_vector=rest.NamedVector( + name="text-dense", + vector=query_dense_vector, + ), query_filter=qdrant_filters, limit=top_k, with_vectors=return_embedding, @@ -454,12 +512,13 @@ def _set_up_collection( def _recreate_collection(self, collection_name: str, distance, embedding_dim: int, on_disk: bool): # noqa: FBT001 self.client.recreate_collection( collection_name=collection_name, - vectors_config=rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, - ), - # TODO: we use named sparse vector, maybe we should named for dense also ? + vectors_config={ + "text-dense": rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + } sparse_vectors_config={ "text-sparse": rest.SparseVectorParams( index=rest.SparseIndexParams( From ad6fcbcceadf466d2a22b368a33d3be8ac2c6d69 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 14 Mar 2024 00:03:04 +0100 Subject: [PATCH 03/41] __init__.py --- .../components/retrievers/qdrant/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index 58be4211a..c3c39882f 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever -__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever") +__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") From c9a571ae2598767f123b2012088b072304c202fd Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 14 Mar 2024 00:07:16 +0100 Subject: [PATCH 04/41] merge batch results for hybrid request --- .../document_stores/qdrant/document_store.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index dd5c4ce2b..25e1b1df9 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -342,17 +342,29 @@ def query_hybrid( ), ], ) + results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] + results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] - results = [self.qdrant_to_haystack.point_to_document(point) for point in points] - # TODO: Check Scaling method if scale_score: - for document in results: + for document in results_dense: + score = document.score + if self.similarity == "cosine": + score = (score + 1) / 2 + else: + score = float(1 / (1 + np.exp(-score / 100))) + document.score = score + # TODO: Check Scaling method for sparse + if scale_score: + for document in results_sparse: score = document.score if self.similarity == "cosine": score = (score + 1) / 2 else: score = float(1 / (1 + np.exp(-score / 100))) document.score = score + + results = results_dense + results_sparse + return results def query_by_sparse( From 3824e8c6c0502ebffb65a8d01956d8e69f53ee61 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 09:22:05 +0100 Subject: [PATCH 05/41] feat(Qdrant): missing comma --- .../components/retrievers/qdrant/retriever.py | 7 ++++--- .../document_stores/qdrant/document_store.py | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 203e20013..c6f5617fb 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -307,7 +307,7 @@ def to_dict(self) -> Dict[str, Any]: document_store=self._document_store, filters=self._filters, top_k_dense=self._top_k_dense, - top_k_sparse=self._top_k_sparse + top_k_sparse=self._top_k_sparse, scale_score=self._scale_score, return_embedding=self._return_embedding, ) @@ -335,7 +335,8 @@ def run( query_sparse_embedding: Dict[str, Union[List[int], List[float]]], query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, - top_k: Optional[int] = None, + top_k_dense: Optional[int] = None, + top_k_sparse: Optional[int] = None, scale_score: Optional[bool] = None, return_embedding: Optional[bool] = None, ): @@ -353,7 +354,7 @@ def run( """ docs = self._document_store.query_hybrid( query_sparse_embedding=query_sparse_embedding, - query_embedding=query_embedding + query_embedding=query_embedding, filters=filters or self._filters, top_k_dense=top_k_dense or self._top_k_dense, top_k_sparse=top_k_sparse or self._top_k_sparse, diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 25e1b1df9..f733baea6 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -322,9 +322,9 @@ def query_hybrid( rest.SearchRequest( vector=rest.NamedVector( name="text-dense", - vector=query_dense_vector, + vector=query_embedding, ), - query_filter=qdrant_filters + query_filter=qdrant_filters, limit=top_k_dense, with_vectors=return_embedding ), @@ -336,7 +336,7 @@ def query_hybrid( values=query_values, ), ), - query_filter=qdrant_filters + query_filter=qdrant_filters, limit=top_k_sparse, with_vectors=return_embedding ), @@ -423,7 +423,7 @@ def query_by_embedding( collection_name=self.index, query_vector=rest.NamedVector( name="text-dense", - vector=query_dense_vector, + vector=query_embedding, ), query_filter=qdrant_filters, limit=top_k, @@ -530,7 +530,7 @@ def _recreate_collection(self, collection_name: str, distance, embedding_dim: in on_disk=on_disk, distance=distance, ), - } + }, sparse_vectors_config={ "text-sparse": rest.SparseVectorParams( index=rest.SparseIndexParams( From 4253a1cb3ccf5cbf9973a868ce9cd232c1bb040b Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 10:21:26 +0100 Subject: [PATCH 06/41] feat(Qdrant): making some test progress --- .../document_stores/qdrant/converters.py | 17 +++++++++++++---- .../document_stores/qdrant/document_store.py | 5 ++--- integrations/qdrant/tests/test_converters.py | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 7ecae7b4a..40cf64c16 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -19,10 +19,15 @@ def documents_to_batch( points = [] for document in documents: payload = document.to_dict(flatten=False) - dense_vector = payload.pop(embedding_field) or {} + vector = {} + if embedding_field in payload and payload[embedding_field] is not None: + dense_vector = payload.pop(embedding_field) or [] + vector["text-dense"] = dense_vector # TODO: Adapt to Haystack Modification of the Document Dataclass - sparse_vector = payload["meta"].pop("_sparse_vector") or {} - vector = {"text-dense": dense_vector, "text-sparse": sparse_vector} + if "_sparse_vector" in payload["meta"]: + sparse_vector = payload["meta"].pop("_sparse_vector", {"indices": [], "values": []}) + sparse_vector_instance = rest.SparseVector(**sparse_vector) + vector["text-sparse"] = sparse_vector_instance _id = self.convert_id(payload.get("id")) point = rest.PointStruct( @@ -61,7 +66,11 @@ def point_to_document(self, point: QdrantPoint) -> Document: payload["score"] = point.score if hasattr(point, "score") else None # TODO: Adapt to Haystack Modification of the Document Dataclass if hasattr(point, "vector") and "text-dense" in point.vector: - payload["meta"]["_sparse_vector"] = point.vector["text-sparse"] + parse_vector_dict = { + "indices": point.vector["text-sparse"].indices, + "values": point.vector["text-sparse"].values + } + payload["meta"]["_sparse_vector"] = parse_vector_dict else: payload["meta"]["_sparse_vector"] = None return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index f733baea6..5eb0ca4d3 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -499,9 +499,8 @@ def _set_up_collection( # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return - - current_distance = collection_info.config.params.vectors.distance - current_vector_size = collection_info.config.params.vectors.size + current_distance = collection_info.config.params.vectors["text-dense"].distance + current_vector_size = collection_info.config.params.vectors["text-dense"].size if current_distance != distance: msg = ( diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index 0c6c5676a..0b21378cb 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -42,11 +42,11 @@ def test_point_to_document_reverts_proper_structure_from_record( "test_field": 1, }, }, - vector=[1.0, 0.0, 0.0, 0.0], + vector={"text-dense":[1.0, 0.0, 0.0, 0.0], "text-sparse": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}}, ) document = qdrant_to_haystack.point_to_document(point) assert "my-id" == document.id assert "Lorem ipsum" == document.content assert "text" == document.content_type - assert {"test_field": 1} == document.meta + assert {"test_field": 1, "_sparse_vector": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}} == document.meta assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) From 37fddebf1eb9824e583c6063cd7b907fd93413c2 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 10:29:44 +0100 Subject: [PATCH 07/41] feat(Qdrant): all current test are fixed --- .../document_stores/qdrant/converters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 40cf64c16..f240177b3 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -59,13 +59,13 @@ def __init__(self, content_field: str, name_field: str, embedding_field: str): def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} - if hasattr(point, "vector") and "text-dense" in point.vector: + if hasattr(point, "vector") and point.vector is not None and "text-dense" in point.vector: payload["embedding"] = point.vector["text-dense"] else: payload["embedding"] = None payload["score"] = point.score if hasattr(point, "score") else None # TODO: Adapt to Haystack Modification of the Document Dataclass - if hasattr(point, "vector") and "text-dense" in point.vector: + if hasattr(point, "vector") and point.vector is not None and "text-sparse" in point.vector: parse_vector_dict = { "indices": point.vector["text-sparse"].indices, "values": point.vector["text-sparse"].values From 550ef45bb37b4aae67ad5f63a91024377ba8bcc9 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 10:32:36 +0100 Subject: [PATCH 08/41] feat(Qdrant): linting --- .../components/retrievers/qdrant/__init__.py | 2 +- .../components/retrievers/qdrant/retriever.py | 6 ++++-- .../document_stores/qdrant/converters.py | 2 +- .../document_stores/qdrant/document_store.py | 10 +++++----- integrations/qdrant/tests/test_converters.py | 10 ++++++++-- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index c3c39882f..c410d369e 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseRetriever __all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index c6f5617fb..ffc28f260 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -239,6 +239,7 @@ def run( return {"documents": docs} + @component class QdrantHybridRetriever: """ @@ -258,7 +259,8 @@ class QdrantHybridRetriever: retriever = QdrantHybridRetriever(document_store=document_store) # using a fake sparse vector to keep the example simple - retriever.run(query_embedding=[0.1]*768, query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) + retriever.run(query_embedding=[0.1]*768, + query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) ``` """ @@ -290,7 +292,7 @@ def __init__( self._document_store = document_store self._filters = filters - self._top_k_dense = top_k_sparse + self._top_k_dense = top_k_dense self._top_k_sparse = top_k_sparse self._scale_score = scale_score self._return_embedding = return_embedding diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index f240177b3..95df20fc0 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -68,7 +68,7 @@ def point_to_document(self, point: QdrantPoint) -> Document: if hasattr(point, "vector") and point.vector is not None and "text-sparse" in point.vector: parse_vector_dict = { "indices": point.vector["text-sparse"].indices, - "values": point.vector["text-sparse"].values + "values": point.vector["text-sparse"].values, } payload["meta"]["_sparse_vector"] = parse_vector_dict else: diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 5eb0ca4d3..6f7d9cfaf 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -315,7 +315,7 @@ def query_hybrid( if len(query_indices) != len(query_values): error_message = "The indices and values of the sparse embedding query must have the same length." raise ValueError(error_message) - + points = self.client.search_batch( collection_name=self.index, requests=[ @@ -326,7 +326,7 @@ def query_hybrid( ), query_filter=qdrant_filters, limit=top_k_dense, - with_vectors=return_embedding + with_vectors=return_embedding, ), rest.SearchRequest( vector=rest.NamedSparseVector( @@ -338,7 +338,7 @@ def query_hybrid( ), query_filter=qdrant_filters, limit=top_k_sparse, - with_vectors=return_embedding + with_vectors=return_embedding, ), ], ) @@ -364,9 +364,9 @@ def query_hybrid( document.score = score results = results_dense + results_sparse - + return results - + def query_by_sparse( self, query_sparse_embedding: Dict[str, Union[List[int], List[float]]], diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index 0b21378cb..5526a9077 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -42,11 +42,17 @@ def test_point_to_document_reverts_proper_structure_from_record( "test_field": 1, }, }, - vector={"text-dense":[1.0, 0.0, 0.0, 0.0], "text-sparse": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}}, + vector={ + "text-dense": [1.0, 0.0, 0.0, 0.0], + "text-sparse": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}, + }, ) document = qdrant_to_haystack.point_to_document(point) assert "my-id" == document.id assert "Lorem ipsum" == document.content assert "text" == document.content_type - assert {"test_field": 1, "_sparse_vector": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}} == document.meta + assert { + "test_field": 1, + "_sparse_vector": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}, + } == document.meta assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) From c79c604c2964e492383467f6215ba851ce8ebdca Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Thu, 14 Mar 2024 11:13:02 +0100 Subject: [PATCH 09/41] feat(Qdrant): working sparse retriver hooray --- .../components/retrievers/qdrant/__init__.py | 4 +- .../components/retrievers/qdrant/retriever.py | 250 ++++++++--------- .../document_stores/qdrant/document_store.py | 136 +++++----- integrations/qdrant/tests/test_retriever.py | 253 +++++++++++++++++- 4 files changed, 447 insertions(+), 196 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index c410d369e..58be4211a 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever -__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") +__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index ffc28f260..eb80c9d11 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -240,128 +240,128 @@ def run( return {"documents": docs} -@component -class QdrantHybridRetriever: - """ - A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). - - Usage example: - ```python - from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever - from haystack_integrations.document_stores.qdrant import QdrantDocumentStore - - document_store = QdrantDocumentStore( - ":memory:", - recreate_index=True, - return_sparse_embedding=True, - wait_result_from_api=True, - ) - retriever = QdrantHybridRetriever(document_store=document_store) - - # using a fake sparse vector to keep the example simple - retriever.run(query_embedding=[0.1]*768, - query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) - ``` - """ - - def __init__( - self, - document_store: QdrantDocumentStore, - filters: Optional[Dict[str, Any]] = None, - top_k_dense: int = 10, - top_k_sparse: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 - ): - """ - Create a QdrantSparseRetriever component. - - :param document_store: An instance of QdrantDocumentStore. - :param filters: A dictionary with filters to narrow down the search space. Default is None. - :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. - :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. - :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. - :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. - - :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. - """ - - if not isinstance(document_store, QdrantDocumentStore): - msg = "document_store must be an instance of QdrantDocumentStore" - raise ValueError(msg) - - self._document_store = document_store - self._filters = filters - self._top_k_dense = top_k_dense - self._top_k_sparse = top_k_sparse - self._scale_score = scale_score - self._return_embedding = return_embedding - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - d = default_to_dict( - self, - document_store=self._document_store, - filters=self._filters, - top_k_dense=self._top_k_dense, - top_k_sparse=self._top_k_sparse, - scale_score=self._scale_score, - return_embedding=self._return_embedding, - ) - d["init_parameters"]["document_store"] = self._document_store.to_dict() - - return d - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. - """ - document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) - data["init_parameters"]["document_store"] = document_store - return default_from_dict(cls, data) - - @component.output_types(documents=List[Document]) - def run( - self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], - query_embedding: List[float], - filters: Optional[Dict[str, Any]] = None, - top_k_dense: Optional[int] = None, - top_k_sparse: Optional[int] = None, - scale_score: Optional[bool] = None, - return_embedding: Optional[bool] = None, - ): - """ - Run the Sparse Embedding Retriever on the given input data. - - :param query_sparse_embedding: Sparse Embedding of the query. - :param filters: A dictionary with filters to narrow down the search space. - :param top_k: The maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents or not. - :param return_embedding: Whether to return the embedding of the retrieved Documents. - :returns: - The retrieved documents. - - """ - docs = self._document_store.query_hybrid( - query_sparse_embedding=query_sparse_embedding, - query_embedding=query_embedding, - filters=filters or self._filters, - top_k_dense=top_k_dense or self._top_k_dense, - top_k_sparse=top_k_sparse or self._top_k_sparse, - scale_score=scale_score or self._scale_score, - return_embedding=return_embedding or self._return_embedding, - ) - - return {"documents": docs} +# @component +# class QdrantHybridRetriever: +# """ +# A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). +# +# Usage example: +# ```python +# from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever +# from haystack_integrations.document_stores.qdrant import QdrantDocumentStore +# +# document_store = QdrantDocumentStore( +# ":memory:", +# recreate_index=True, +# return_sparse_embedding=True, +# wait_result_from_api=True, +# ) +# retriever = QdrantHybridRetriever(document_store=document_store) +# +# # using a fake sparse vector to keep the example simple +# retriever.run(query_embedding=[0.1]*768, +# query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) +# ``` +# """ +# +# def __init__( +# self, +# document_store: QdrantDocumentStore, +# filters: Optional[Dict[str, Any]] = None, +# top_k_dense: int = 10, +# top_k_sparse: int = 10, +# scale_score: bool = True, +# return_embedding: bool = False, +# ): +# """ +# Create a QdrantSparseRetriever component. +# +# :param document_store: An instance of QdrantDocumentStore. +# :param filters: A dictionary with filters to narrow down the search space. Default is None. +# :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. +# :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. +# :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. +# :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. +# +# :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. +# """ +# +# if not isinstance(document_store, QdrantDocumentStore): +# msg = "document_store must be an instance of QdrantDocumentStore" +# raise ValueError(msg) +# +# self._document_store = document_store +# self._filters = filters +# self._top_k_dense = top_k_dense +# self._top_k_sparse = top_k_sparse +# self._scale_score = scale_score +# self._return_embedding = return_embedding +# +# def to_dict(self) -> Dict[str, Any]: +# """ +# Serializes the component to a dictionary. +# +# :returns: +# Dictionary with serialized data. +# """ +# d = default_to_dict( +# self, +# document_store=self._document_store, +# filters=self._filters, +# top_k_dense=self._top_k_dense, +# top_k_sparse=self._top_k_sparse, +# scale_score=self._scale_score, +# return_embedding=self._return_embedding, +# ) +# d["init_parameters"]["document_store"] = self._document_store.to_dict() +# +# return d +# +# @classmethod +# def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": +# """ +# Deserializes the component from a dictionary. +# +# :param data: +# Dictionary to deserialize from. +# :returns: +# Deserialized component. +# """ +# document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) +# data["init_parameters"]["document_store"] = document_store +# return default_from_dict(cls, data) +# +# @component.output_types(documents=List[Document]) +# def run( +# self, +# query_sparse_embedding: Dict[str, Union[List[int], List[float]]], +# query_embedding: List[float], +# filters: Optional[Dict[str, Any]] = None, +# top_k_dense: Optional[int] = None, +# top_k_sparse: Optional[int] = None, +# scale_score: Optional[bool] = None, +# return_embedding: Optional[bool] = None, +# ): +# """ +# Run the Sparse Embedding Retriever on the given input data. +# +# :param query_sparse_embedding: Sparse Embedding of the query. +# :param filters: A dictionary with filters to narrow down the search space. +# :param top_k: The maximum number of documents to return. +# :param scale_score: Whether to scale the scores of the retrieved documents or not. +# :param return_embedding: Whether to return the embedding of the retrieved Documents. +# :returns: +# The retrieved documents. +# +# """ +# docs = self._document_store.query_hybrid( +# query_sparse_embedding=query_sparse_embedding, +# query_embedding=query_embedding, +# filters=filters or self._filters, +# top_k_dense=top_k_dense or self._top_k_dense, +# top_k_sparse=top_k_sparse or self._top_k_sparse, +# scale_score=scale_score or self._scale_score, +# return_embedding=return_embedding or self._return_embedding, +# ) +# +# return {"documents": docs} diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 6f7d9cfaf..2e15e062e 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -298,74 +298,74 @@ def get_documents_by_id( documents.append(self.qdrant_to_haystack.point_to_document(record)) return documents - def query_hybrid( - self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], - query_embedding: List[float], - filters: Optional[Dict[str, Any]] = None, - top_k_dense: int = 10, - top_k_sparse: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 - ) -> List[Document]: - qdrant_filters = self.qdrant_filter_converter.convert(filters) - - query_indices = query_sparse_embedding["indices"] - query_values = query_sparse_embedding["values"] - if len(query_indices) != len(query_values): - error_message = "The indices and values of the sparse embedding query must have the same length." - raise ValueError(error_message) - - points = self.client.search_batch( - collection_name=self.index, - requests=[ - rest.SearchRequest( - vector=rest.NamedVector( - name="text-dense", - vector=query_embedding, - ), - query_filter=qdrant_filters, - limit=top_k_dense, - with_vectors=return_embedding, - ), - rest.SearchRequest( - vector=rest.NamedSparseVector( - name="text-sparse", - vector=rest.SparseVector( - indices=query_indices, - values=query_values, - ), - ), - query_filter=qdrant_filters, - limit=top_k_sparse, - with_vectors=return_embedding, - ), - ], - ) - results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] - results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] - - if scale_score: - for document in results_dense: - score = document.score - if self.similarity == "cosine": - score = (score + 1) / 2 - else: - score = float(1 / (1 + np.exp(-score / 100))) - document.score = score - # TODO: Check Scaling method for sparse - if scale_score: - for document in results_sparse: - score = document.score - if self.similarity == "cosine": - score = (score + 1) / 2 - else: - score = float(1 / (1 + np.exp(-score / 100))) - document.score = score - - results = results_dense + results_sparse - - return results + # def query_hybrid( + # self, + # query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + # query_embedding: List[float], + # filters: Optional[Dict[str, Any]] = None, + # top_k_dense: int = 10, + # top_k_sparse: int = 10, + # scale_score: bool = True, + # return_embedding: bool = False, + # ) -> List[Document]: + # qdrant_filters = self.qdrant_filter_converter.convert(filters) + # + # query_indices = query_sparse_embedding["indices"] + # query_values = query_sparse_embedding["values"] + # if len(query_indices) != len(query_values): + # error_message = "The indices and values of the sparse embedding query must have the same length." + # raise ValueError(error_message) + # + # points = self.client.search_batch( + # collection_name=self.index, + # requests=[ + # rest.SearchRequest( + # vector=rest.NamedVector( + # name="text-dense", + # vector=query_embedding, + # ), + # filter=qdrant_filters, + # limit=top_k_dense, + # # with_vectors=return_embedding, # TODO not supported ? + # ), + # rest.SearchRequest( + # vector=rest.NamedSparseVector( + # name="text-sparse", + # vector=rest.SparseVector( + # indices=query_indices, + # values=query_values, + # ), + # ), + # filter=qdrant_filters, + # limit=top_k_sparse, + # # with_vectors=return_embedding, # TODO not supported ? + # ), + # ], + # ) + # results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] + # results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] + # + # if scale_score: + # for document in results_dense: + # score = document.score + # if self.similarity == "cosine": + # score = (score + 1) / 2 + # else: + # score = float(1 / (1 + np.exp(-score / 100))) + # document.score = score + # # TODO: Check Scaling method for sparse + # if scale_score: + # for document in results_sparse: + # score = document.score + # if self.similarity == "cosine": + # score = (score + 1) / 2 + # else: + # score = float(1 / (1 + np.exp(-score / 100))) + # document.score = score + # + # results = results_dense + results_sparse + # + # return results def query_by_sparse( self, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 41d9b3088..eddd6aaa1 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -1,11 +1,12 @@ from typing import List +import numpy as np from haystack.dataclasses import Document from haystack.testing.document_store import ( FilterableDocsFixtureMixin, _random_embeddings, ) -from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever +from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever, QdrantSparseRetriever from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -112,3 +113,253 @@ def test_run(self, filterable_docs: List[Document]): for document in results["documents"]: # type: ignore assert document.embedding is None + + +class TestQdrantSparseRetriever(FilterableDocsFixtureMixin): + def test_init_default(self): + document_store = QdrantDocumentStore(location=":memory:", index="test") + retriever = QdrantSparseRetriever(document_store=document_store) + assert retriever._document_store == document_store + assert retriever._filters is None + assert retriever._top_k == 10 + assert retriever._return_embedding is False + + def test_to_dict(self): + document_store = QdrantDocumentStore(location=":memory:", index="test") + retriever = QdrantSparseRetriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantSparseRetriever", + "init_parameters": { + "document_store": { + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + "init_parameters": { + "location": ":memory:", + "url": None, + "port": 6333, + "grpc_port": 6334, + "prefer_grpc": False, + "https": None, + "api_key": None, + "prefix": None, + "timeout": None, + "host": None, + "path": None, + "index": "test", + "embedding_dim": 768, + "on_disk": False, + "content_field": "content", + "name_field": "name", + "embedding_field": "embedding", + "similarity": "cosine", + "return_embedding": False, + "progress_bar": True, + "duplicate_documents": "overwrite", + "recreate_index": False, + "shard_number": None, + "replication_factor": None, + "write_consistency_factor": None, + "on_disk_payload": None, + "hnsw_config": None, + "optimizers_config": None, + "wal_config": None, + "quantization_config": None, + "init_from": None, + "wait_result_from_api": True, + "metadata": {}, + "write_batch_size": 100, + "scroll_size": 10000, + "payload_fields_to_index": None, + }, + }, + "filters": None, + "top_k": 10, + "scale_score": True, + "return_embedding": False, + }, + } + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantSparseRetriever", + "init_parameters": { + "document_store": { + "init_parameters": {"location": ":memory:", "index": "test"}, + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + }, + "filters": None, + "top_k": 5, + "scale_score": False, + "return_embedding": True, + }, + } + retriever = QdrantSparseRetriever.from_dict(data) + assert isinstance(retriever._document_store, QdrantDocumentStore) + assert retriever._document_store.index == "test" + assert retriever._filters is None + assert retriever._top_k == 5 + assert retriever._scale_score is False + assert retriever._return_embedding is True + + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(3, 15) + data = { + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], + } + list_of_sparse_vectors.append(data) + return list_of_sparse_vectors + + def test_run(self, filterable_docs: List[Document]): + document_store = QdrantDocumentStore(location=":memory:", index="Boi") + + # Add fake sparse embedding to documents + for doc in filterable_docs: + doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] + + document_store.write_documents(filterable_docs) + + retriever = QdrantSparseRetriever(document_store=document_store) + + results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0]) + + assert len(results["documents"]) == 10 # type: ignore + + results = retriever.run( + query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], top_k=5, return_embedding=False + ) + + assert len(results["documents"]) == 5 # type: ignore + + for document in results["documents"]: # type: ignore + assert document.embedding is None + + +# class TestQdrantHybridRetriever(FilterableDocsFixtureMixin): +# def test_init_default(self): +# document_store = QdrantDocumentStore(location=":memory:", index="test") +# retriever = QdrantHybridRetriever(document_store=document_store) +# assert retriever._document_store == document_store +# assert retriever._filters is None +# assert retriever._top_k_sparse == 10 +# assert retriever._top_k_dense == 10 +# assert retriever._return_embedding is False +# +# def test_to_dict(self): +# document_store = QdrantDocumentStore(location=":memory:", index="test") +# retriever = QdrantHybridRetriever(document_store=document_store) +# res = retriever.to_dict() +# assert res == { +# "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", +# "init_parameters": { +# "document_store": { +# "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", +# "init_parameters": { +# "location": ":memory:", +# "url": None, +# "port": 6333, +# "grpc_port": 6334, +# "prefer_grpc": False, +# "https": None, +# "api_key": None, +# "prefix": None, +# "timeout": None, +# "host": None, +# "path": None, +# "index": "test", +# "embedding_dim": 768, +# "on_disk": False, +# "content_field": "content", +# "name_field": "name", +# "embedding_field": "embedding", +# "similarity": "cosine", +# "return_embedding": False, +# "progress_bar": True, +# "duplicate_documents": "overwrite", +# "recreate_index": False, +# "shard_number": None, +# "replication_factor": None, +# "write_consistency_factor": None, +# "on_disk_payload": None, +# "hnsw_config": None, +# "optimizers_config": None, +# "wal_config": None, +# "quantization_config": None, +# "init_from": None, +# "wait_result_from_api": True, +# "metadata": {}, +# "write_batch_size": 100, +# "scroll_size": 10000, +# "payload_fields_to_index": None, +# }, +# }, +# "filters": None, +# "top_k_sparse": 10, +# "top_k_dense": 10, +# "scale_score": True, +# "return_embedding": False, +# }, +# } +# +# def test_from_dict(self): +# data = { +# "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", +# "init_parameters": { +# "document_store": { +# "init_parameters": {"location": ":memory:", "index": "test"}, +# "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", +# }, +# "filters": None, +# "top_k_sparse": 5, +# "top_k_dense": 5, +# "scale_score": False, +# "return_embedding": True, +# }, +# } +# retriever = QdrantHybridRetriever.from_dict(data) +# assert isinstance(retriever._document_store, QdrantDocumentStore) +# assert retriever._document_store.index == "test" +# assert retriever._filters is None +# assert retriever._top_k_sparse == 5 +# assert retriever._top_k_dense == 5 +# assert retriever._scale_score is False +# assert retriever._return_embedding is True +# +# def _generate_mocked_sparse_embedding(self, n): +# list_of_sparse_vectors = [] +# for _ in range(n): +# random_indice_length = np.random.randint(3, 15) +# data = { +# "indices": list(range(random_indice_length)), +# "values": [np.random.random_sample() for _ in range(random_indice_length)], +# } +# list_of_sparse_vectors.append(data) +# return list_of_sparse_vectors +# +# def test_run(self, filterable_docs: List[Document]): +# document_store = QdrantDocumentStore(location=":memory:", index="Boi") +# +# # Add fake sparse embedding to documents +# for doc in filterable_docs: +# doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] +# +# document_store.write_documents(filterable_docs) +# +# retriever = QdrantHybridRetriever(document_store=document_store) +# +# results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], +# query_embedding=_random_embeddings(768)) +# +# assert len(results["documents"]) == 20 # type: ignore +# +# results = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], +# query_embedding=_random_embeddings(768), +# top_k_dense=5, top_k_sparse=5, +# return_embedding=False) +# +# assert len(results["documents"]) == 10 # type: ignore +# +# for document in results["documents"]: # type: ignore +# assert document.embedding is None From 91d67f72eba33b589fd2dbf5051a46e63fab7548 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 20 Mar 2024 21:30:39 +0100 Subject: [PATCH 10/41] feat(Qdrant): fix hybrid retriver --- .../components/retrievers/qdrant/__init__.py | 4 +- .../components/retrievers/qdrant/retriever.py | 250 ++++++++--------- .../document_stores/qdrant/document_store.py | 140 +++++----- integrations/qdrant/tests/test_retriever.py | 254 +++++++++--------- 4 files changed, 321 insertions(+), 327 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index 58be4211a..c3c39882f 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever -__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever") +__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index eb80c9d11..0eed5cbbe 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -240,128 +240,128 @@ def run( return {"documents": docs} -# @component -# class QdrantHybridRetriever: -# """ -# A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). -# -# Usage example: -# ```python -# from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever -# from haystack_integrations.document_stores.qdrant import QdrantDocumentStore -# -# document_store = QdrantDocumentStore( -# ":memory:", -# recreate_index=True, -# return_sparse_embedding=True, -# wait_result_from_api=True, -# ) -# retriever = QdrantHybridRetriever(document_store=document_store) -# -# # using a fake sparse vector to keep the example simple -# retriever.run(query_embedding=[0.1]*768, -# query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) -# ``` -# """ -# -# def __init__( -# self, -# document_store: QdrantDocumentStore, -# filters: Optional[Dict[str, Any]] = None, -# top_k_dense: int = 10, -# top_k_sparse: int = 10, -# scale_score: bool = True, -# return_embedding: bool = False, -# ): -# """ -# Create a QdrantSparseRetriever component. -# -# :param document_store: An instance of QdrantDocumentStore. -# :param filters: A dictionary with filters to narrow down the search space. Default is None. -# :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. -# :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. -# :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. -# :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. -# -# :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. -# """ -# -# if not isinstance(document_store, QdrantDocumentStore): -# msg = "document_store must be an instance of QdrantDocumentStore" -# raise ValueError(msg) -# -# self._document_store = document_store -# self._filters = filters -# self._top_k_dense = top_k_dense -# self._top_k_sparse = top_k_sparse -# self._scale_score = scale_score -# self._return_embedding = return_embedding -# -# def to_dict(self) -> Dict[str, Any]: -# """ -# Serializes the component to a dictionary. -# -# :returns: -# Dictionary with serialized data. -# """ -# d = default_to_dict( -# self, -# document_store=self._document_store, -# filters=self._filters, -# top_k_dense=self._top_k_dense, -# top_k_sparse=self._top_k_sparse, -# scale_score=self._scale_score, -# return_embedding=self._return_embedding, -# ) -# d["init_parameters"]["document_store"] = self._document_store.to_dict() -# -# return d -# -# @classmethod -# def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": -# """ -# Deserializes the component from a dictionary. -# -# :param data: -# Dictionary to deserialize from. -# :returns: -# Deserialized component. -# """ -# document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) -# data["init_parameters"]["document_store"] = document_store -# return default_from_dict(cls, data) -# -# @component.output_types(documents=List[Document]) -# def run( -# self, -# query_sparse_embedding: Dict[str, Union[List[int], List[float]]], -# query_embedding: List[float], -# filters: Optional[Dict[str, Any]] = None, -# top_k_dense: Optional[int] = None, -# top_k_sparse: Optional[int] = None, -# scale_score: Optional[bool] = None, -# return_embedding: Optional[bool] = None, -# ): -# """ -# Run the Sparse Embedding Retriever on the given input data. -# -# :param query_sparse_embedding: Sparse Embedding of the query. -# :param filters: A dictionary with filters to narrow down the search space. -# :param top_k: The maximum number of documents to return. -# :param scale_score: Whether to scale the scores of the retrieved documents or not. -# :param return_embedding: Whether to return the embedding of the retrieved Documents. -# :returns: -# The retrieved documents. -# -# """ -# docs = self._document_store.query_hybrid( -# query_sparse_embedding=query_sparse_embedding, -# query_embedding=query_embedding, -# filters=filters or self._filters, -# top_k_dense=top_k_dense or self._top_k_dense, -# top_k_sparse=top_k_sparse or self._top_k_sparse, -# scale_score=scale_score or self._scale_score, -# return_embedding=return_embedding or self._return_embedding, -# ) -# -# return {"documents": docs} +@component +class QdrantHybridRetriever: + """ + A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). + + Usage example: + ```python + from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever + from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_sparse_embedding=True, + wait_result_from_api=True, + ) + retriever = QdrantHybridRetriever(document_store=document_store) + + # using a fake sparse vector to keep the example simple + retriever.run(query_embedding=[0.1]*768, + query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) + ``` + """ + + def __init__( + self, + document_store: QdrantDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k_dense: int = 10, + top_k_sparse: int = 10, + scale_score: bool = True, + return_embedding: bool = False, + ): + """ + Create a QdrantSparseRetriever component. + + :param document_store: An instance of QdrantDocumentStore. + :param filters: A dictionary with filters to narrow down the search space. Default is None. + :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. + :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. + :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. + :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. + + :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. + """ + + if not isinstance(document_store, QdrantDocumentStore): + msg = "document_store must be an instance of QdrantDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters + self._top_k_dense = top_k_dense + self._top_k_sparse = top_k_sparse + self._scale_score = scale_score + self._return_embedding = return_embedding + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + d = default_to_dict( + self, + document_store=self._document_store, + filters=self._filters, + top_k_dense=self._top_k_dense, + top_k_sparse=self._top_k_sparse, + scale_score=self._scale_score, + return_embedding=self._return_embedding, + ) + d["init_parameters"]["document_store"] = self._document_store.to_dict() + + return d + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) + data["init_parameters"]["document_store"] = document_store + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run( + self, + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k_dense: Optional[int] = None, + top_k_sparse: Optional[int] = None, + scale_score: Optional[bool] = None, + return_embedding: Optional[bool] = None, + ): + """ + Run the Sparse Embedding Retriever on the given input data. + + :param query_sparse_embedding: Sparse Embedding of the query. + :param filters: A dictionary with filters to narrow down the search space. + :param top_k: The maximum number of documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents or not. + :param return_embedding: Whether to return the embedding of the retrieved Documents. + :returns: + The retrieved documents. + + """ + docs = self._document_store.query_hybrid( + query_sparse_embedding=query_sparse_embedding, + query_embedding=query_embedding, + filters=filters or self._filters, + top_k_dense=top_k_dense or self._top_k_dense, + top_k_sparse=top_k_sparse or self._top_k_sparse, + scale_score=scale_score or self._scale_score, + return_embedding=return_embedding or self._return_embedding, + ) + + return {"documents": docs} diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 2e15e062e..0e43ae2ae 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -298,74 +298,72 @@ def get_documents_by_id( documents.append(self.qdrant_to_haystack.point_to_document(record)) return documents - # def query_hybrid( - # self, - # query_sparse_embedding: Dict[str, Union[List[int], List[float]]], - # query_embedding: List[float], - # filters: Optional[Dict[str, Any]] = None, - # top_k_dense: int = 10, - # top_k_sparse: int = 10, - # scale_score: bool = True, - # return_embedding: bool = False, - # ) -> List[Document]: - # qdrant_filters = self.qdrant_filter_converter.convert(filters) - # - # query_indices = query_sparse_embedding["indices"] - # query_values = query_sparse_embedding["values"] - # if len(query_indices) != len(query_values): - # error_message = "The indices and values of the sparse embedding query must have the same length." - # raise ValueError(error_message) - # - # points = self.client.search_batch( - # collection_name=self.index, - # requests=[ - # rest.SearchRequest( - # vector=rest.NamedVector( - # name="text-dense", - # vector=query_embedding, - # ), - # filter=qdrant_filters, - # limit=top_k_dense, - # # with_vectors=return_embedding, # TODO not supported ? - # ), - # rest.SearchRequest( - # vector=rest.NamedSparseVector( - # name="text-sparse", - # vector=rest.SparseVector( - # indices=query_indices, - # values=query_values, - # ), - # ), - # filter=qdrant_filters, - # limit=top_k_sparse, - # # with_vectors=return_embedding, # TODO not supported ? - # ), - # ], - # ) - # results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] - # results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] - # - # if scale_score: - # for document in results_dense: - # score = document.score - # if self.similarity == "cosine": - # score = (score + 1) / 2 - # else: - # score = float(1 / (1 + np.exp(-score / 100))) - # document.score = score - # # TODO: Check Scaling method for sparse - # if scale_score: - # for document in results_sparse: - # score = document.score - # if self.similarity == "cosine": - # score = (score + 1) / 2 - # else: - # score = float(1 / (1 + np.exp(-score / 100))) - # document.score = score - # - # results = results_dense + results_sparse - # - # return results + def query_hybrid( + self, + query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k_dense: int = 10, + top_k_sparse: int = 10, + scale_score: bool = True, + return_embedding: bool = False, + ) -> List[Document]: + qdrant_filters = self.qdrant_filter_converter.convert(filters) + + query_indices = query_sparse_embedding["indices"] + query_values = query_sparse_embedding["values"] + if len(query_indices) != len(query_values): + error_message = "The indices and values of the sparse embedding query must have the same length." + raise ValueError(error_message) + + points = self.client.search_batch( + collection_name=self.index, + requests=[ + rest.SearchRequest( + vector=rest.NamedVector( + name="text-dense", + vector=query_embedding, + ), + filter=qdrant_filters, + limit=top_k_dense, + with_payload=True, + with_vector=return_embedding, + ), + rest.SearchRequest( + vector=rest.NamedSparseVector( + name="text-sparse", + vector=rest.SparseVector( + indices=query_indices, + values=query_values, + ), + ), + filter=qdrant_filters, + limit=top_k_sparse, + with_payload=True, + with_vector=return_embedding, + ), + ], + ) + results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] + results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] + + if scale_score: + for document in results_dense: + score = document.score + if self.similarity == "cosine": + score = (score + 1) / 2 + else: + score = float(1 / (1 + np.exp(-score / 100))) + document.score = score + if scale_score: + for document in results_sparse: + score = document.score + score = float(1 / (1 + np.exp(-score / 100))) + document.score = score + + results = results_dense + results_sparse + + return results def query_by_sparse( self, @@ -398,14 +396,10 @@ def query_by_sparse( ) results = [self.qdrant_to_haystack.point_to_document(point) for point in points] - # TODO: Check Scaling method if scale_score: for document in results: score = document.score - if self.similarity == "cosine": - score = (score + 1) / 2 - else: - score = float(1 / (1 + np.exp(-score / 100))) + score = float(1 / (1 + np.exp(-score / 100))) document.score = score return results diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index eddd6aaa1..a33ed3bc8 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -6,7 +6,7 @@ FilterableDocsFixtureMixin, _random_embeddings, ) -from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever, QdrantSparseRetriever +from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -237,129 +237,129 @@ def test_run(self, filterable_docs: List[Document]): assert document.embedding is None -# class TestQdrantHybridRetriever(FilterableDocsFixtureMixin): -# def test_init_default(self): -# document_store = QdrantDocumentStore(location=":memory:", index="test") -# retriever = QdrantHybridRetriever(document_store=document_store) -# assert retriever._document_store == document_store -# assert retriever._filters is None -# assert retriever._top_k_sparse == 10 -# assert retriever._top_k_dense == 10 -# assert retriever._return_embedding is False -# -# def test_to_dict(self): -# document_store = QdrantDocumentStore(location=":memory:", index="test") -# retriever = QdrantHybridRetriever(document_store=document_store) -# res = retriever.to_dict() -# assert res == { -# "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", -# "init_parameters": { -# "document_store": { -# "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", -# "init_parameters": { -# "location": ":memory:", -# "url": None, -# "port": 6333, -# "grpc_port": 6334, -# "prefer_grpc": False, -# "https": None, -# "api_key": None, -# "prefix": None, -# "timeout": None, -# "host": None, -# "path": None, -# "index": "test", -# "embedding_dim": 768, -# "on_disk": False, -# "content_field": "content", -# "name_field": "name", -# "embedding_field": "embedding", -# "similarity": "cosine", -# "return_embedding": False, -# "progress_bar": True, -# "duplicate_documents": "overwrite", -# "recreate_index": False, -# "shard_number": None, -# "replication_factor": None, -# "write_consistency_factor": None, -# "on_disk_payload": None, -# "hnsw_config": None, -# "optimizers_config": None, -# "wal_config": None, -# "quantization_config": None, -# "init_from": None, -# "wait_result_from_api": True, -# "metadata": {}, -# "write_batch_size": 100, -# "scroll_size": 10000, -# "payload_fields_to_index": None, -# }, -# }, -# "filters": None, -# "top_k_sparse": 10, -# "top_k_dense": 10, -# "scale_score": True, -# "return_embedding": False, -# }, -# } -# -# def test_from_dict(self): -# data = { -# "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", -# "init_parameters": { -# "document_store": { -# "init_parameters": {"location": ":memory:", "index": "test"}, -# "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", -# }, -# "filters": None, -# "top_k_sparse": 5, -# "top_k_dense": 5, -# "scale_score": False, -# "return_embedding": True, -# }, -# } -# retriever = QdrantHybridRetriever.from_dict(data) -# assert isinstance(retriever._document_store, QdrantDocumentStore) -# assert retriever._document_store.index == "test" -# assert retriever._filters is None -# assert retriever._top_k_sparse == 5 -# assert retriever._top_k_dense == 5 -# assert retriever._scale_score is False -# assert retriever._return_embedding is True -# -# def _generate_mocked_sparse_embedding(self, n): -# list_of_sparse_vectors = [] -# for _ in range(n): -# random_indice_length = np.random.randint(3, 15) -# data = { -# "indices": list(range(random_indice_length)), -# "values": [np.random.random_sample() for _ in range(random_indice_length)], -# } -# list_of_sparse_vectors.append(data) -# return list_of_sparse_vectors -# -# def test_run(self, filterable_docs: List[Document]): -# document_store = QdrantDocumentStore(location=":memory:", index="Boi") -# -# # Add fake sparse embedding to documents -# for doc in filterable_docs: -# doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] -# -# document_store.write_documents(filterable_docs) -# -# retriever = QdrantHybridRetriever(document_store=document_store) -# -# results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], -# query_embedding=_random_embeddings(768)) -# -# assert len(results["documents"]) == 20 # type: ignore -# -# results = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], -# query_embedding=_random_embeddings(768), -# top_k_dense=5, top_k_sparse=5, -# return_embedding=False) -# -# assert len(results["documents"]) == 10 # type: ignore -# -# for document in results["documents"]: # type: ignore -# assert document.embedding is None +class TestQdrantHybridRetriever(FilterableDocsFixtureMixin): + def test_init_default(self): + document_store = QdrantDocumentStore(location=":memory:", index="test") + retriever = QdrantHybridRetriever(document_store=document_store) + assert retriever._document_store == document_store + assert retriever._filters is None + assert retriever._top_k_sparse == 10 + assert retriever._top_k_dense == 10 + assert retriever._return_embedding is False + + def test_to_dict(self): + document_store = QdrantDocumentStore(location=":memory:", index="test") + retriever = QdrantHybridRetriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", + "init_parameters": { + "document_store": { + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + "init_parameters": { + "location": ":memory:", + "url": None, + "port": 6333, + "grpc_port": 6334, + "prefer_grpc": False, + "https": None, + "api_key": None, + "prefix": None, + "timeout": None, + "host": None, + "path": None, + "index": "test", + "embedding_dim": 768, + "on_disk": False, + "content_field": "content", + "name_field": "name", + "embedding_field": "embedding", + "similarity": "cosine", + "return_embedding": False, + "progress_bar": True, + "duplicate_documents": "overwrite", + "recreate_index": False, + "shard_number": None, + "replication_factor": None, + "write_consistency_factor": None, + "on_disk_payload": None, + "hnsw_config": None, + "optimizers_config": None, + "wal_config": None, + "quantization_config": None, + "init_from": None, + "wait_result_from_api": True, + "metadata": {}, + "write_batch_size": 100, + "scroll_size": 10000, + "payload_fields_to_index": None, + }, + }, + "filters": None, + "top_k_sparse": 10, + "top_k_dense": 10, + "scale_score": True, + "return_embedding": False, + }, + } + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", + "init_parameters": { + "document_store": { + "init_parameters": {"location": ":memory:", "index": "test"}, + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + }, + "filters": None, + "top_k_sparse": 5, + "top_k_dense": 5, + "scale_score": False, + "return_embedding": True, + }, + } + retriever = QdrantHybridRetriever.from_dict(data) + assert isinstance(retriever._document_store, QdrantDocumentStore) + assert retriever._document_store.index == "test" + assert retriever._filters is None + assert retriever._top_k_sparse == 5 + assert retriever._top_k_dense == 5 + assert retriever._scale_score is False + assert retriever._return_embedding is True + + def _generate_mocked_sparse_embedding(self, n): + list_of_sparse_vectors = [] + for _ in range(n): + random_indice_length = np.random.randint(3, 15) + data = { + "indices": list(range(random_indice_length)), + "values": [np.random.random_sample() for _ in range(random_indice_length)], + } + list_of_sparse_vectors.append(data) + return list_of_sparse_vectors + + def test_run(self, filterable_docs: List[Document]): + document_store = QdrantDocumentStore(location=":memory:", index="Boi") + + # Add fake sparse embedding to documents + for doc in filterable_docs: + doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] + + document_store.write_documents(filterable_docs) + + retriever = QdrantHybridRetriever(document_store=document_store) + + results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], + query_embedding=_random_embeddings(768)) + + assert len(results["documents"]) == 20 # type: ignore + + results = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], + query_embedding=_random_embeddings(768), + top_k_dense=5, top_k_sparse=5, + return_embedding=False) + + assert len(results["documents"]) == 10 # type: ignore + + for document in results["documents"]: # type: ignore + assert document.embedding is None From 115c01b844108e1bb5c5ecff268a3833a902a2cd Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 20 Mar 2024 22:35:37 +0100 Subject: [PATCH 11/41] feat(Qdrant): modify PR for haystack 2.1.0 with proper sparse vectors --- integrations/qdrant/pyproject.toml | 2 +- .../document_stores/qdrant/converters.py | 19 +++++++++++-------- .../document_stores/qdrant/document_store.py | 4 ++++ integrations/qdrant/tests/test_converters.py | 8 ++++---- .../qdrant/tests/test_dict_converters.py | 3 +++ integrations/qdrant/tests/test_retriever.py | 9 ++++++--- 6 files changed, 29 insertions(+), 16 deletions(-) diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml index 40a97d9b9..4ac3abda7 100644 --- a/integrations/qdrant/pyproject.toml +++ b/integrations/qdrant/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.0.0b6", "qdrant-client"] +dependencies = ["haystack-ai>=2.1.0", "qdrant-client"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 95df20fc0..3e51ff4cc 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -1,7 +1,7 @@ import uuid from typing import List, Union -from haystack.dataclasses import Document +from haystack.dataclasses import Document, SparseEmbedding from qdrant_client.http import models as rest @@ -15,6 +15,7 @@ def documents_to_batch( documents: List[Document], *, embedding_field: str, + sparse_embedding_field: str, ) -> List[rest.PointStruct]: points = [] for document in documents: @@ -23,9 +24,10 @@ def documents_to_batch( if embedding_field in payload and payload[embedding_field] is not None: dense_vector = payload.pop(embedding_field) or [] vector["text-dense"] = dense_vector - # TODO: Adapt to Haystack Modification of the Document Dataclass - if "_sparse_vector" in payload["meta"]: - sparse_vector = payload["meta"].pop("_sparse_vector", {"indices": [], "values": []}) + # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? + # TODO: maybe check if not empty string also on top of None ? + if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None: + sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) sparse_vector_instance = rest.SparseVector(**sparse_vector) vector["text-sparse"] = sparse_vector_instance _id = self.convert_id(payload.get("id")) @@ -52,10 +54,11 @@ def convert_id(self, _id: str) -> str: class QdrantToHaystack: - def __init__(self, content_field: str, name_field: str, embedding_field: str): + def __init__(self, content_field: str, name_field: str, embedding_field: str, sparse_embedding_field: str): self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field + self.sparse_embedding_field = sparse_embedding_field def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} @@ -64,13 +67,13 @@ def point_to_document(self, point: QdrantPoint) -> Document: else: payload["embedding"] = None payload["score"] = point.score if hasattr(point, "score") else None - # TODO: Adapt to Haystack Modification of the Document Dataclass if hasattr(point, "vector") and point.vector is not None and "text-sparse" in point.vector: parse_vector_dict = { "indices": point.vector["text-sparse"].indices, "values": point.vector["text-sparse"].values, } - payload["meta"]["_sparse_vector"] = parse_vector_dict + payload["sparse_embedding"] = parse_vector_dict else: - payload["meta"]["_sparse_vector"] = None + # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? + payload["sparse_embedding"] = None return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 0e43ae2ae..569083714 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -67,6 +67,7 @@ def __init__( content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", + sparse_embedding_field: str = "sparse_embedding", similarity: str = "cosine", return_embedding: bool = False, # noqa: FBT001, FBT002 progress_bar: bool = True, # noqa: FBT001, FBT002 @@ -142,6 +143,7 @@ def __init__( self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field + self.sparse_embedding_field = sparse_embedding_field self.similarity = similarity self.index = index self.return_embedding = return_embedding @@ -153,6 +155,7 @@ def __init__( content_field, name_field, embedding_field, + sparse_embedding_field ) self.write_batch_size = write_batch_size self.scroll_size = scroll_size @@ -212,6 +215,7 @@ def write_documents( batch = self.haystack_to_qdrant_converter.documents_to_batch( document_batch, embedding_field=self.embedding_field, + sparse_embedding_field=self.sparse_embedding_field ) self.client.upsert( diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index 5526a9077..162af2141 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -6,6 +6,7 @@ CONTENT_FIELD = "content" NAME_FIELD = "name" EMBEDDING_FIELD = "vector" +SPARSE_EMBEDDING_FIELD = "sparse-vector" @pytest.fixture @@ -19,6 +20,7 @@ def qdrant_to_haystack() -> QdrantToHaystack: content_field=CONTENT_FIELD, name_field=NAME_FIELD, embedding_field=EMBEDDING_FIELD, + sparse_embedding_field=SPARSE_EMBEDDING_FIELD ) @@ -51,8 +53,6 @@ def test_point_to_document_reverts_proper_structure_from_record( assert "my-id" == document.id assert "Lorem ipsum" == document.content assert "text" == document.content_type - assert { - "test_field": 1, - "_sparse_vector": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}, - } == document.meta + assert {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]} == document.sparse_embedding.to_dict() + assert {"test_field": 1} == document.meta assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index 3da64743a..32fb750fb 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -25,6 +25,7 @@ def test_to_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -63,6 +64,7 @@ def test_from_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -86,6 +88,7 @@ def test_from_dict(): document_store.content_field == "content", document_store.name_field == "name", document_store.embedding_field == "embedding", + document_store.sparse_embedding_field == "sparse_embedding", document_store.on_disk is False, document_store.similarity == "cosine", document_store.return_embedding is False, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index a33ed3bc8..0d409b864 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -1,7 +1,7 @@ from typing import List import numpy as np -from haystack.dataclasses import Document +from haystack.dataclasses import Document, SparseEmbedding from haystack.testing.document_store import ( FilterableDocsFixtureMixin, _random_embeddings, @@ -46,6 +46,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -151,6 +152,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -217,7 +219,7 @@ def test_run(self, filterable_docs: List[Document]): # Add fake sparse embedding to documents for doc in filterable_docs: - doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] + doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0]) document_store.write_documents(filterable_docs) @@ -274,6 +276,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -343,7 +346,7 @@ def test_run(self, filterable_docs: List[Document]): # Add fake sparse embedding to documents for doc in filterable_docs: - doc.meta["_sparse_vector"] = self._generate_mocked_sparse_embedding(1)[0] + doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0]) document_store.write_documents(filterable_docs) From 9c4e256162dddfccb23f27fee81b6cdd89701663 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Wed, 20 Mar 2024 22:44:31 +0100 Subject: [PATCH 12/41] feat(Qdrant): fix lint --- .../components/retrievers/qdrant/__init__.py | 2 +- .../components/retrievers/qdrant/retriever.py | 4 ++-- .../document_stores/qdrant/converters.py | 2 +- .../document_stores/qdrant/document_store.py | 13 ++++------- integrations/qdrant/tests/test_converters.py | 2 +- integrations/qdrant/tests/test_retriever.py | 22 +++++++++++++------ 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index c3c39882f..c410d369e 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseRetriever __all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 0eed5cbbe..ffc28f260 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -270,8 +270,8 @@ def __init__( filters: Optional[Dict[str, Any]] = None, top_k_dense: int = 10, top_k_sparse: int = 10, - scale_score: bool = True, - return_embedding: bool = False, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 ): """ Create a QdrantSparseRetriever component. diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 3e51ff4cc..5405bf840 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -1,7 +1,7 @@ import uuid from typing import List, Union -from haystack.dataclasses import Document, SparseEmbedding +from haystack.dataclasses import Document from qdrant_client.http import models as rest diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 569083714..3ea4de74c 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -151,12 +151,7 @@ def __init__( self.duplicate_documents = duplicate_documents self.qdrant_filter_converter = QdrantFilterConverter() self.haystack_to_qdrant_converter = HaystackToQdrant() - self.qdrant_to_haystack = QdrantToHaystack( - content_field, - name_field, - embedding_field, - sparse_embedding_field - ) + self.qdrant_to_haystack = QdrantToHaystack(content_field, name_field, embedding_field, sparse_embedding_field) self.write_batch_size = write_batch_size self.scroll_size = scroll_size @@ -215,7 +210,7 @@ def write_documents( batch = self.haystack_to_qdrant_converter.documents_to_batch( document_batch, embedding_field=self.embedding_field, - sparse_embedding_field=self.sparse_embedding_field + sparse_embedding_field=self.sparse_embedding_field, ) self.client.upsert( @@ -309,8 +304,8 @@ def query_hybrid( filters: Optional[Dict[str, Any]] = None, top_k_dense: int = 10, top_k_sparse: int = 10, - scale_score: bool = True, - return_embedding: bool = False, + scale_score: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, # noqa: FBT001, FBT002 ) -> List[Document]: qdrant_filters = self.qdrant_filter_converter.convert(filters) diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index 162af2141..c4de6d981 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -20,7 +20,7 @@ def qdrant_to_haystack() -> QdrantToHaystack: content_field=CONTENT_FIELD, name_field=NAME_FIELD, embedding_field=EMBEDDING_FIELD, - sparse_embedding_field=SPARSE_EMBEDDING_FIELD + sparse_embedding_field=SPARSE_EMBEDDING_FIELD, ) diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 0d409b864..73b391412 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -6,7 +6,11 @@ FilterableDocsFixtureMixin, _random_embeddings, ) -from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever, QdrantSparseRetriever, QdrantHybridRetriever +from haystack_integrations.components.retrievers.qdrant import ( + QdrantEmbeddingRetriever, + QdrantHybridRetriever, + QdrantSparseRetriever, +) from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -352,15 +356,19 @@ def test_run(self, filterable_docs: List[Document]): retriever = QdrantHybridRetriever(document_store=document_store) - results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], - query_embedding=_random_embeddings(768)) + results: List[Document] = retriever.run( + query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], query_embedding=_random_embeddings(768) + ) assert len(results["documents"]) == 20 # type: ignore - results = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], - query_embedding=_random_embeddings(768), - top_k_dense=5, top_k_sparse=5, - return_embedding=False) + results = retriever.run( + query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], + query_embedding=_random_embeddings(768), + top_k_dense=5, + top_k_sparse=5, + return_embedding=False, + ) assert len(results["documents"]) == 10 # type: ignore From 33395989ec5a06d859effc65d20d577cdacd51db Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Mar 2024 15:27:06 +0100 Subject: [PATCH 13/41] test w Haystack main --- .github/workflows/qdrant.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/qdrant.yml b/.github/workflows/qdrant.yml index 5995911fb..a629dbb7f 100644 --- a/.github/workflows/qdrant.yml +++ b/.github/workflows/qdrant.yml @@ -58,7 +58,9 @@ jobs: - name: Run tests id: tests - run: hatch run cov + run: | + hatch run pip install git+https://github.com/deepset-ai/haystack.git #TODO: rm before merging + hatch run cov - name: Nightly - run unit tests with Haystack main branch if: github.event_name == 'schedule' From 725c9dcc6ac458a48fa0f1a8d75ed4b02813cd79 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Mar 2024 15:29:30 +0100 Subject: [PATCH 14/41] fix deps --- integrations/qdrant/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml index 9133c0670..29be8da0f 100644 --- a/integrations/qdrant/pyproject.toml +++ b/integrations/qdrant/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.1.0", "qdrant-client"] +dependencies = ["haystack-ai", "qdrant-client"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" From cea3cb7265458111e5b1595e9c4ce4fd331b15d3 Mon Sep 17 00:00:00 2001 From: Corentin Date: Thu, 21 Mar 2024 19:14:38 +0100 Subject: [PATCH 15/41] Update integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py Co-authored-by: Anush --- .../components/retrievers/qdrant/retriever.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index ffc28f260..059fab391 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -140,7 +140,6 @@ class QdrantSparseRetriever: ) retriever = QdrantSparseRetriever(document_store=document_store) - # using a fake sparse vector to keep the example simple retriever.run(query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) ``` """ From 94411351ead3c5fbff0937f1442796c5e370e2ec Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 09:08:47 +0100 Subject: [PATCH 16/41] feat(Qdrant): remove hybrid & old code, constant for vector field names --- .../components/retrievers/qdrant/__init__.py | 4 +- .../components/retrievers/qdrant/retriever.py | 127 ----------------- .../document_stores/qdrant/converters.py | 28 ++-- .../document_stores/qdrant/document_store.py | 82 ++--------- integrations/qdrant/tests/test_retriever.py | 134 ------------------ 5 files changed, 21 insertions(+), 354 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py index c410d369e..58be4211a 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .retriever import QdrantEmbeddingRetriever, QdrantHybridRetriever, QdrantSparseRetriever +from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever -__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever", "QdrantHybridRetriever") +__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever") diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 059fab391..fb81587c3 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -237,130 +237,3 @@ def run( ) return {"documents": docs} - - -@component -class QdrantHybridRetriever: - """ - A component for retrieving documents from an QdrantDocumentStore using hybrid search (dense+sparse). - - Usage example: - ```python - from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever - from haystack_integrations.document_stores.qdrant import QdrantDocumentStore - - document_store = QdrantDocumentStore( - ":memory:", - recreate_index=True, - return_sparse_embedding=True, - wait_result_from_api=True, - ) - retriever = QdrantHybridRetriever(document_store=document_store) - - # using a fake sparse vector to keep the example simple - retriever.run(query_embedding=[0.1]*768, - query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) - ``` - """ - - def __init__( - self, - document_store: QdrantDocumentStore, - filters: Optional[Dict[str, Any]] = None, - top_k_dense: int = 10, - top_k_sparse: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 - ): - """ - Create a QdrantSparseRetriever component. - - :param document_store: An instance of QdrantDocumentStore. - :param filters: A dictionary with filters to narrow down the search space. Default is None. - :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10. - :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10. - :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. - :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False. - - :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore. - """ - - if not isinstance(document_store, QdrantDocumentStore): - msg = "document_store must be an instance of QdrantDocumentStore" - raise ValueError(msg) - - self._document_store = document_store - self._filters = filters - self._top_k_dense = top_k_dense - self._top_k_sparse = top_k_sparse - self._scale_score = scale_score - self._return_embedding = return_embedding - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - d = default_to_dict( - self, - document_store=self._document_store, - filters=self._filters, - top_k_dense=self._top_k_dense, - top_k_sparse=self._top_k_sparse, - scale_score=self._scale_score, - return_embedding=self._return_embedding, - ) - d["init_parameters"]["document_store"] = self._document_store.to_dict() - - return d - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. - """ - document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"]) - data["init_parameters"]["document_store"] = document_store - return default_from_dict(cls, data) - - @component.output_types(documents=List[Document]) - def run( - self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], - query_embedding: List[float], - filters: Optional[Dict[str, Any]] = None, - top_k_dense: Optional[int] = None, - top_k_sparse: Optional[int] = None, - scale_score: Optional[bool] = None, - return_embedding: Optional[bool] = None, - ): - """ - Run the Sparse Embedding Retriever on the given input data. - - :param query_sparse_embedding: Sparse Embedding of the query. - :param filters: A dictionary with filters to narrow down the search space. - :param top_k: The maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents or not. - :param return_embedding: Whether to return the embedding of the retrieved Documents. - :returns: - The retrieved documents. - - """ - docs = self._document_store.query_hybrid( - query_sparse_embedding=query_sparse_embedding, - query_embedding=query_embedding, - filters=filters or self._filters, - top_k_dense=top_k_dense or self._top_k_dense, - top_k_sparse=top_k_sparse or self._top_k_sparse, - scale_score=scale_score or self._scale_score, - return_embedding=return_embedding or self._return_embedding, - ) - - return {"documents": docs} diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index b93960308..b8a166a17 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -7,6 +7,9 @@ logger = logging.getLogger(__name__) +DENSE_VECTORS_NAME = "text-dense" +SPARSE_VECTORS_NAME = "text-sparse" + class HaystackToQdrant: """A converter from Haystack to Qdrant types.""" @@ -26,26 +29,15 @@ def documents_to_batch( vector = {} if embedding_field in payload and payload[embedding_field] is not None: dense_vector = payload.pop(embedding_field) or [] - vector["text-dense"] = dense_vector + vector[DENSE_VECTORS_NAME] = dense_vector # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? # TODO: maybe check if not empty string also on top of None ? if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None: sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) sparse_vector_instance = rest.SparseVector(**sparse_vector) - vector["text-sparse"] = sparse_vector_instance + vector[SPARSE_VECTORS_NAME] = sparse_vector_instance _id = self.convert_id(payload.get("id")) - # TODO: remove as soon as we introduce the support for sparse embeddings in Qdrant - if "sparse_embedding" in payload: - sparse_embedding = payload.pop("sparse_embedding", None) - if sparse_embedding: - logger.warning( - "Document %s has the `sparse_embedding` field set," - "but storing sparse embeddings in Qdrant is not currently supported." - "The `sparse_embedding` field will be ignored.", - payload["id"], - ) - point = rest.PointStruct( payload=payload, vector=vector, @@ -76,15 +68,15 @@ def __init__(self, content_field: str, name_field: str, embedding_field: str, sp def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} - if hasattr(point, "vector") and point.vector is not None and "text-dense" in point.vector: - payload["embedding"] = point.vector["text-dense"] + if hasattr(point, "vector") and point.vector is not None and DENSE_VECTORS_NAME in point.vector: + payload["embedding"] = point.vector[DENSE_VECTORS_NAME] else: payload["embedding"] = None payload["score"] = point.score if hasattr(point, "score") else None - if hasattr(point, "vector") and point.vector is not None and "text-sparse" in point.vector: + if hasattr(point, "vector") and point.vector is not None and SPARSE_VECTORS_NAME in point.vector: parse_vector_dict = { - "indices": point.vector["text-sparse"].indices, - "values": point.vector["text-sparse"].values, + "indices": point.vector[SPARSE_VECTORS_NAME].indices, + "values": point.vector[SPARSE_VECTORS_NAME].values, } payload["sparse_embedding"] = parse_vector_dict else: diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 3ea4de74c..8d26ebba1 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) +DENSE_VECTORS_NAME = "text-dense" +SPARSE_VECTORS_NAME = "text-sparse" + class QdrantStoreError(DocumentStoreError): pass @@ -297,73 +300,6 @@ def get_documents_by_id( documents.append(self.qdrant_to_haystack.point_to_document(record)) return documents - def query_hybrid( - self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], - query_embedding: List[float], - filters: Optional[Dict[str, Any]] = None, - top_k_dense: int = 10, - top_k_sparse: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 - ) -> List[Document]: - qdrant_filters = self.qdrant_filter_converter.convert(filters) - - query_indices = query_sparse_embedding["indices"] - query_values = query_sparse_embedding["values"] - if len(query_indices) != len(query_values): - error_message = "The indices and values of the sparse embedding query must have the same length." - raise ValueError(error_message) - - points = self.client.search_batch( - collection_name=self.index, - requests=[ - rest.SearchRequest( - vector=rest.NamedVector( - name="text-dense", - vector=query_embedding, - ), - filter=qdrant_filters, - limit=top_k_dense, - with_payload=True, - with_vector=return_embedding, - ), - rest.SearchRequest( - vector=rest.NamedSparseVector( - name="text-sparse", - vector=rest.SparseVector( - indices=query_indices, - values=query_values, - ), - ), - filter=qdrant_filters, - limit=top_k_sparse, - with_payload=True, - with_vector=return_embedding, - ), - ], - ) - results_dense = [self.qdrant_to_haystack.point_to_document(point) for point in points[0]] - results_sparse = [self.qdrant_to_haystack.point_to_document(point) for point in points[1]] - - if scale_score: - for document in results_dense: - score = document.score - if self.similarity == "cosine": - score = (score + 1) / 2 - else: - score = float(1 / (1 + np.exp(-score / 100))) - document.score = score - if scale_score: - for document in results_sparse: - score = document.score - score = float(1 / (1 + np.exp(-score / 100))) - document.score = score - - results = results_dense + results_sparse - - return results - def query_by_sparse( self, query_sparse_embedding: Dict[str, Union[List[int], List[float]]], @@ -383,7 +319,7 @@ def query_by_sparse( points = self.client.search( collection_name=self.index, query_vector=rest.NamedSparseVector( - name="text-sparse", + name=SPARSE_VECTORS_NAME, vector=rest.SparseVector( indices=query_indices, values=query_values, @@ -415,7 +351,7 @@ def query_by_embedding( points = self.client.search( collection_name=self.index, query_vector=rest.NamedVector( - name="text-dense", + name=DENSE_VECTORS_NAME, vector=query_embedding, ), query_filter=qdrant_filters, @@ -492,8 +428,8 @@ def _set_up_collection( # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return - current_distance = collection_info.config.params.vectors["text-dense"].distance - current_vector_size = collection_info.config.params.vectors["text-dense"].size + current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance + current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size if current_distance != distance: msg = ( @@ -517,14 +453,14 @@ def _recreate_collection(self, collection_name: str, distance, embedding_dim: in self.client.recreate_collection( collection_name=collection_name, vectors_config={ - "text-dense": rest.VectorParams( + DENSE_VECTORS_NAME: rest.VectorParams( size=embedding_dim, on_disk=on_disk, distance=distance, ), }, sparse_vectors_config={ - "text-sparse": rest.SparseVectorParams( + SPARSE_VECTORS_NAME: rest.SparseVectorParams( index=rest.SparseIndexParams( on_disk=on_disk, ) diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 73b391412..fb4ac704e 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -8,7 +8,6 @@ ) from haystack_integrations.components.retrievers.qdrant import ( QdrantEmbeddingRetriever, - QdrantHybridRetriever, QdrantSparseRetriever, ) from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -241,136 +240,3 @@ def test_run(self, filterable_docs: List[Document]): for document in results["documents"]: # type: ignore assert document.embedding is None - - -class TestQdrantHybridRetriever(FilterableDocsFixtureMixin): - def test_init_default(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") - retriever = QdrantHybridRetriever(document_store=document_store) - assert retriever._document_store == document_store - assert retriever._filters is None - assert retriever._top_k_sparse == 10 - assert retriever._top_k_dense == 10 - assert retriever._return_embedding is False - - def test_to_dict(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") - retriever = QdrantHybridRetriever(document_store=document_store) - res = retriever.to_dict() - assert res == { - "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", - "init_parameters": { - "document_store": { - "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", - "init_parameters": { - "location": ":memory:", - "url": None, - "port": 6333, - "grpc_port": 6334, - "prefer_grpc": False, - "https": None, - "api_key": None, - "prefix": None, - "timeout": None, - "host": None, - "path": None, - "index": "test", - "embedding_dim": 768, - "on_disk": False, - "content_field": "content", - "name_field": "name", - "embedding_field": "embedding", - "sparse_embedding_field": "sparse_embedding", - "similarity": "cosine", - "return_embedding": False, - "progress_bar": True, - "duplicate_documents": "overwrite", - "recreate_index": False, - "shard_number": None, - "replication_factor": None, - "write_consistency_factor": None, - "on_disk_payload": None, - "hnsw_config": None, - "optimizers_config": None, - "wal_config": None, - "quantization_config": None, - "init_from": None, - "wait_result_from_api": True, - "metadata": {}, - "write_batch_size": 100, - "scroll_size": 10000, - "payload_fields_to_index": None, - }, - }, - "filters": None, - "top_k_sparse": 10, - "top_k_dense": 10, - "scale_score": True, - "return_embedding": False, - }, - } - - def test_from_dict(self): - data = { - "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantHybridRetriever", - "init_parameters": { - "document_store": { - "init_parameters": {"location": ":memory:", "index": "test"}, - "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", - }, - "filters": None, - "top_k_sparse": 5, - "top_k_dense": 5, - "scale_score": False, - "return_embedding": True, - }, - } - retriever = QdrantHybridRetriever.from_dict(data) - assert isinstance(retriever._document_store, QdrantDocumentStore) - assert retriever._document_store.index == "test" - assert retriever._filters is None - assert retriever._top_k_sparse == 5 - assert retriever._top_k_dense == 5 - assert retriever._scale_score is False - assert retriever._return_embedding is True - - def _generate_mocked_sparse_embedding(self, n): - list_of_sparse_vectors = [] - for _ in range(n): - random_indice_length = np.random.randint(3, 15) - data = { - "indices": list(range(random_indice_length)), - "values": [np.random.random_sample() for _ in range(random_indice_length)], - } - list_of_sparse_vectors.append(data) - return list_of_sparse_vectors - - def test_run(self, filterable_docs: List[Document]): - document_store = QdrantDocumentStore(location=":memory:", index="Boi") - - # Add fake sparse embedding to documents - for doc in filterable_docs: - doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0]) - - document_store.write_documents(filterable_docs) - - retriever = QdrantHybridRetriever(document_store=document_store) - - results: List[Document] = retriever.run( - query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], query_embedding=_random_embeddings(768) - ) - - assert len(results["documents"]) == 20 # type: ignore - - results = retriever.run( - query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], - query_embedding=_random_embeddings(768), - top_k_dense=5, - top_k_sparse=5, - return_embedding=False, - ) - - assert len(results["documents"]) == 10 # type: ignore - - for document in results["documents"]: # type: ignore - assert document.embedding is None From 6cacc787836ce6a5e6f1f89f1a98a17c1dcb67b8 Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 22 Mar 2024 10:56:31 +0100 Subject: [PATCH 17/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Stefano Fiorucci --- .../document_stores/qdrant/converters.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index b8a166a17..1eb8ca4d0 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -27,9 +27,7 @@ def documents_to_batch( for document in documents: payload = document.to_dict(flatten=False) vector = {} - if embedding_field in payload and payload[embedding_field] is not None: - dense_vector = payload.pop(embedding_field) or [] - vector[DENSE_VECTORS_NAME] = dense_vector + vector[DENSE_VECTORS_NAME] = payload.pop(embedding_field, None) or [] # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? # TODO: maybe check if not empty string also on top of None ? if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None: From d911b182b8bedc479c78ea22deb003fc4827dfbd Mon Sep 17 00:00:00 2001 From: Corentin Date: Fri, 22 Mar 2024 11:00:22 +0100 Subject: [PATCH 18/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Stefano Fiorucci --- .../document_stores/qdrant/converters.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 1eb8ca4d0..be05833c2 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -30,10 +30,9 @@ def documents_to_batch( vector[DENSE_VECTORS_NAME] = payload.pop(embedding_field, None) or [] # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? # TODO: maybe check if not empty string also on top of None ? - if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None: - sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) - sparse_vector_instance = rest.SparseVector(**sparse_vector) - vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) + sparse_vector_instance = rest.SparseVector(**sparse_vector) + vector[SPARSE_VECTORS_NAME] = sparse_vector_instance _id = self.convert_id(payload.get("id")) point = rest.PointStruct( From 125c123543104e7a0917539833dd40c958820a8f Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 11:08:15 +0100 Subject: [PATCH 19/41] feat(Qdrant): reverting pop change, changing Dict to SparseEmbedding type --- .../components/retrievers/qdrant/retriever.py | 5 +++-- .../document_stores/qdrant/converters.py | 13 +++++++------ .../document_stores/qdrant/document_store.py | 6 ++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index fb81587c3..5d44107eb 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -1,6 +1,7 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from haystack import Document, component, default_from_dict, default_to_dict +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack_integrations.document_stores.qdrant import QdrantDocumentStore @@ -210,7 +211,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever": @component.output_types(documents=List[Document]) def run( self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_sparse_embedding: SparseEmbedding, filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None, scale_score: Optional[bool] = None, diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index be05833c2..a1efa48cd 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -27,12 +27,13 @@ def documents_to_batch( for document in documents: payload = document.to_dict(flatten=False) vector = {} - vector[DENSE_VECTORS_NAME] = payload.pop(embedding_field, None) or [] - # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? - # TODO: maybe check if not empty string also on top of None ? - sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) - sparse_vector_instance = rest.SparseVector(**sparse_vector) - vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + if embedding_field in payload and payload[embedding_field] is not None: + dense_vector = payload.pop(embedding_field) or [] + vector[DENSE_VECTORS_NAME] = dense_vector + if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None and payload[sparse_embedding_field] != "": + sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) + sparse_vector_instance = rest.SparseVector(**sparse_vector) + vector[SPARSE_VECTORS_NAME] = sparse_vector_instance _id = self.convert_id(payload.get("id")) point = rest.PointStruct( diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 8d26ebba1..d68903f06 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -8,6 +8,7 @@ from grpc import RpcError from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils import Secret, deserialize_secrets_inplace @@ -302,7 +303,7 @@ def get_documents_by_id( def query_by_sparse( self, - query_sparse_embedding: Dict[str, Union[List[int], List[float]]], + query_sparse_embedding: SparseEmbedding, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = True, # noqa: FBT001, FBT002 @@ -312,9 +313,6 @@ def query_by_sparse( query_indices = query_sparse_embedding["indices"] query_values = query_sparse_embedding["values"] - if len(query_indices) != len(query_values): - error_message = "The indices and values of the sparse embedding query must have the same length." - raise ValueError(error_message) points = self.client.search( collection_name=self.index, From 7cf1882e7f02f23ffec1e233de47622f0b85b087 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 11:08:47 +0100 Subject: [PATCH 20/41] feat(Qdrant): fix lint --- .../document_stores/qdrant/converters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index a1efa48cd..75ebb89d6 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -30,7 +30,11 @@ def documents_to_batch( if embedding_field in payload and payload[embedding_field] is not None: dense_vector = payload.pop(embedding_field) or [] vector[DENSE_VECTORS_NAME] = dense_vector - if sparse_embedding_field in payload and payload[sparse_embedding_field] is not None and payload[sparse_embedding_field] != "": + if ( + sparse_embedding_field in payload + and payload[sparse_embedding_field] is not None + and payload[sparse_embedding_field] != "" + ): sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) sparse_vector_instance = rest.SparseVector(**sparse_vector) vector[SPARSE_VECTORS_NAME] = sparse_vector_instance From 9749ee086f8f49b2dd89fde5ecf5153802a68557 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 11:29:43 +0100 Subject: [PATCH 21/41] feat(Qdrant): remove old todo --- .../haystack_integrations/document_stores/qdrant/converters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 75ebb89d6..a7dbab57f 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -82,6 +82,5 @@ def point_to_document(self, point: QdrantPoint) -> Document: } payload["sparse_embedding"] = parse_vector_dict else: - # TODO: does this work with this https://github.com/deepset-ai/haystack/pull/7382#discussion_r1530515595 ? payload["sparse_embedding"] = None return Document.from_dict(payload) From 2683a74c45e8e595446c15c0bda7ced1417ec00a Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Mar 2024 13:01:45 +0100 Subject: [PATCH 22/41] simplify documents_to_batch --- .../document_stores/qdrant/converters.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index a7dbab57f..bd38db214 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -27,17 +27,16 @@ def documents_to_batch( for document in documents: payload = document.to_dict(flatten=False) vector = {} - if embedding_field in payload and payload[embedding_field] is not None: - dense_vector = payload.pop(embedding_field) or [] + + dense_vector = payload.pop(embedding_field, None) + if dense_vector is not None: vector[DENSE_VECTORS_NAME] = dense_vector - if ( - sparse_embedding_field in payload - and payload[sparse_embedding_field] is not None - and payload[sparse_embedding_field] != "" - ): - sparse_vector = payload.pop(sparse_embedding_field, {"indices": [], "values": []}) + + sparse_vector = payload.pop(sparse_embedding_field, None) + if sparse_vector is not None: sparse_vector_instance = rest.SparseVector(**sparse_vector) vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + _id = self.convert_id(payload.get("id")) point = rest.PointStruct( From 79d0d527c3887b09333d011b392ee15878586c83 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 14:09:11 +0100 Subject: [PATCH 23/41] feat(Qdrant): SparseEmbedding instead of Dict --- .../components/retrievers/qdrant/retriever.py | 5 +++-- .../document_stores/qdrant/document_store.py | 5 ++--- integrations/qdrant/tests/test_retriever.py | 8 +++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 5d44107eb..12a67a3b7 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -132,6 +132,7 @@ class QdrantSparseRetriever: ```python from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + from haystack.dataclasses.sparse_embedding import SparseEmbedding document_store = QdrantDocumentStore( ":memory:", @@ -140,8 +141,8 @@ class QdrantSparseRetriever: wait_result_from_api=True, ) retriever = QdrantSparseRetriever(document_store=document_store) - - retriever.run(query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]}) + sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) + retriever.run(query_sparse_embedding=sparse_embedding) ``` """ diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index d68903f06..f304a73d8 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -310,9 +310,8 @@ def query_by_sparse( return_embedding: bool = False, # noqa: FBT001, FBT002 ) -> List[Document]: qdrant_filters = self.qdrant_filter_converter.convert(filters) - - query_indices = query_sparse_embedding["indices"] - query_values = query_sparse_embedding["values"] + query_indices = query_sparse_embedding.indices + query_values = query_sparse_embedding.values points = self.client.search( collection_name=self.index, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index fb4ac704e..6b24270b4 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -227,14 +227,12 @@ def test_run(self, filterable_docs: List[Document]): document_store.write_documents(filterable_docs) retriever = QdrantSparseRetriever(document_store=document_store) - - results: List[Document] = retriever.run(query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0]) + sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) + results: List[Document] = retriever.run(query_sparse_embedding=sparse_embedding) assert len(results["documents"]) == 10 # type: ignore - results = retriever.run( - query_sparse_embedding=self._generate_mocked_sparse_embedding(1)[0], top_k=5, return_embedding=False - ) + results = retriever.run(query_sparse_embedding=sparse_embedding, top_k=5, return_embedding=False) assert len(results["documents"]) == 5 # type: ignore From 529719a0fb8c37727a0ca005d43cdd1e0be3ca80 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 17:02:36 +0100 Subject: [PATCH 24/41] feat(Qdrant): introducing `use_sparse_embeddings` parameters for document store to make sparse embeddings non breaking change. Need more testing --- .../document_stores/qdrant/converters.py | 59 ++++--- .../document_stores/qdrant/document_store.py | 155 +++++++++++++----- integrations/qdrant/tests/test_converters.py | 32 +++- .../qdrant/tests/test_dict_converters.py | 3 + integrations/qdrant/tests/test_retriever.py | 11 +- 5 files changed, 188 insertions(+), 72 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index bd38db214..f5d220365 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -21,22 +21,26 @@ def documents_to_batch( documents: List[Document], *, embedding_field: str, + use_sparse_embeddings: bool, sparse_embedding_field: str, ) -> List[rest.PointStruct]: points = [] for document in documents: payload = document.to_dict(flatten=False) - vector = {} + if use_sparse_embeddings: + vector = {} - dense_vector = payload.pop(embedding_field, None) - if dense_vector is not None: - vector[DENSE_VECTORS_NAME] = dense_vector + dense_vector = payload.pop(embedding_field, None) + if dense_vector is not None: + vector[DENSE_VECTORS_NAME] = dense_vector - sparse_vector = payload.pop(sparse_embedding_field, None) - if sparse_vector is not None: - sparse_vector_instance = rest.SparseVector(**sparse_vector) - vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + sparse_vector = payload.pop(sparse_embedding_field, None) + if sparse_vector is not None: + sparse_vector_instance = rest.SparseVector(**sparse_vector) + vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + if not use_sparse_embeddings: + vector = payload.pop(embedding_field) or {} _id = self.convert_id(payload.get("id")) point = rest.PointStruct( @@ -61,25 +65,38 @@ def convert_id(self, _id: str) -> str: class QdrantToHaystack: - def __init__(self, content_field: str, name_field: str, embedding_field: str, sparse_embedding_field: str): + def __init__( + self, + content_field: str, + name_field: str, + embedding_field: str, + use_sparse_embeddings: bool, # noqa: FBT001 + sparse_embedding_field: str, + ): self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field + self.use_sparse_embeddings = use_sparse_embeddings self.sparse_embedding_field = sparse_embedding_field def point_to_document(self, point: QdrantPoint) -> Document: payload = {**point.payload} - if hasattr(point, "vector") and point.vector is not None and DENSE_VECTORS_NAME in point.vector: - payload["embedding"] = point.vector[DENSE_VECTORS_NAME] - else: - payload["embedding"] = None payload["score"] = point.score if hasattr(point, "score") else None - if hasattr(point, "vector") and point.vector is not None and SPARSE_VECTORS_NAME in point.vector: - parse_vector_dict = { - "indices": point.vector[SPARSE_VECTORS_NAME].indices, - "values": point.vector[SPARSE_VECTORS_NAME].values, - } - payload["sparse_embedding"] = parse_vector_dict - else: - payload["sparse_embedding"] = None + if not self.use_sparse_embeddings: + payload["embedding"] = point.vector if hasattr(point, "vector") else None + + if self.use_sparse_embeddings: + if hasattr(point, "vector") and point.vector is not None and DENSE_VECTORS_NAME in point.vector: + payload["embedding"] = point.vector[DENSE_VECTORS_NAME] + else: + payload["embedding"] = None + + if hasattr(point, "vector") and point.vector is not None and SPARSE_VECTORS_NAME in point.vector: + parse_vector_dict = { + "indices": point.vector[SPARSE_VECTORS_NAME].indices, + "values": point.vector[SPARSE_VECTORS_NAME].values, + } + payload["sparse_embedding"] = parse_vector_dict + else: + payload["sparse_embedding"] = None return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index f304a73d8..0cffcf31b 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -71,6 +71,7 @@ def __init__( content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", + use_sparse_embeddings: bool = False, # noqa: FBT001, FBT002 sparse_embedding_field: str = "sparse_embedding", similarity: str = "cosine", return_embedding: bool = False, # noqa: FBT001, FBT002 @@ -140,13 +141,16 @@ def __init__( self.payload_fields_to_index = payload_fields_to_index # Make sure the collection is properly set up - self._set_up_collection(index, embedding_dim, recreate_index, similarity, on_disk, payload_fields_to_index) + self._set_up_collection( + index, embedding_dim, recreate_index, similarity, use_sparse_embeddings, on_disk, payload_fields_to_index + ) self.embedding_dim = embedding_dim self.on_disk = on_disk self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field + self.use_sparse_embeddings = use_sparse_embeddings self.sparse_embedding_field = sparse_embedding_field self.similarity = similarity self.index = index @@ -155,7 +159,9 @@ def __init__( self.duplicate_documents = duplicate_documents self.qdrant_filter_converter = QdrantFilterConverter() self.haystack_to_qdrant_converter = HaystackToQdrant() - self.qdrant_to_haystack = QdrantToHaystack(content_field, name_field, embedding_field, sparse_embedding_field) + self.qdrant_to_haystack = QdrantToHaystack( + content_field, name_field, embedding_field, use_sparse_embeddings, sparse_embedding_field + ) self.write_batch_size = write_batch_size self.scroll_size = scroll_size @@ -196,7 +202,7 @@ def write_documents( if not isinstance(doc, Document): msg = f"DocumentStore.write_documents() expects a list of Documents but got an element of {type(doc)}." raise ValueError(msg) - self._set_up_collection(self.index, self.embedding_dim, False, self.similarity) + self._set_up_collection(self.index, self.embedding_dim, False, self.similarity, self.use_sparse_embeddings) if len(documents) == 0: logger.warning("Calling QdrantDocumentStore.write_documents() with empty list") @@ -214,6 +220,7 @@ def write_documents( batch = self.haystack_to_qdrant_converter.documents_to_batch( document_batch, embedding_field=self.embedding_field, + use_sparse_embeddings=self.use_sparse_embeddings, sparse_embedding_field=self.sparse_embedding_field, ) @@ -309,10 +316,17 @@ def query_by_sparse( scale_score: bool = True, # noqa: FBT001, FBT002 return_embedding: bool = False, # noqa: FBT001, FBT002 ) -> List[Document]: + + if not self.use_sparse_embeddings: + message = ( + "Error: tried to query by sparse vector with a Qdrant " + "Document Store initialized with use_sparse_embeddings=False" + ) + raise ValueError(message) + qdrant_filters = self.qdrant_filter_converter.convert(filters) query_indices = query_sparse_embedding.indices query_values = query_sparse_embedding.values - points = self.client.search( collection_name=self.index, query_vector=rest.NamedSparseVector( @@ -326,7 +340,6 @@ def query_by_sparse( limit=top_k, with_vectors=return_embedding, ) - results = [self.qdrant_to_haystack.point_to_document(point) for point in points] if scale_score: for document in results: @@ -345,17 +358,25 @@ def query_by_embedding( ) -> List[Document]: qdrant_filters = self.qdrant_filter_converter.convert(filters) - points = self.client.search( - collection_name=self.index, - query_vector=rest.NamedVector( - name=DENSE_VECTORS_NAME, - vector=query_embedding, - ), - query_filter=qdrant_filters, - limit=top_k, - with_vectors=return_embedding, - ) - + if self.use_sparse_embeddings: + points = self.client.search( + collection_name=self.index, + query_vector=rest.NamedVector( + name=DENSE_VECTORS_NAME, + vector=query_embedding, + ), + query_filter=qdrant_filters, + limit=top_k, + with_vectors=return_embedding, + ) + if not self.use_sparse_embeddings: + points = self.client.search( + collection_name=self.index, + query_vector=query_embedding, + query_filter=qdrant_filters, + limit=top_k, + with_vectors=return_embedding, + ) results = [self.qdrant_to_haystack.point_to_document(point) for point in points] if scale_score: for document in results: @@ -397,6 +418,7 @@ def _set_up_collection( embedding_dim: int, recreate_collection: bool, # noqa: FBT001 similarity: str, + use_sparse_embeddings: bool, # noqa: FBT001 on_disk: bool = False, # noqa: FBT001, FBT002 payload_fields_to_index: Optional[List[dict]] = None, ): @@ -405,7 +427,7 @@ def _set_up_collection( if recreate_collection: # There is no need to verify the current configuration of that # collection. It might be just recreated again. - self._recreate_collection(collection_name, distance, embedding_dim, on_disk) + self._recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings) # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return @@ -421,12 +443,33 @@ def _set_up_collection( # Qdrant local raises ValueError if the collection is not found, but # with the remote server UnexpectedResponse / RpcError is raised. # Until that's unified, we need to catch both. - self._recreate_collection(collection_name, distance, embedding_dim, on_disk) + self._recreate_collection(collection_name, distance, embedding_dim, on_disk, use_sparse_embeddings) # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return - current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance - current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size + if self.use_sparse_embeddings: + current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance + current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size + if not self.use_sparse_embeddings: + current_distance = collection_info.config.params.vectors.distance + current_vector_size = collection_info.config.params.vectors.size + + if self.use_sparse_embeddings and not isinstance(collection_info.config.params.vectors, dict): + msg = ( + f"Collection '{collection_name}' already exists in Qdrant, " + f"but it has been originaly created without sparse embedding vectors." + f"If you want to use that collection, either set `use_sparse_embeddings=False` " + f"or run a migration script " + f"to use Named Dense Vectors (`text-sparse`) and Named Sparse Vectors (`text-dense`)." + ) + raise ValueError(msg) + if not self.use_sparse_embeddings and isinstance(collection_info.config.params.vectors, dict): + msg = ( + f"Collection '{collection_name}' already exists in Qdrant, " + f"but it has been originaly created with sparse embedding vectors." + f"If you want to use that collection, please set `use_sparse_embeddings=True`" + ) + raise ValueError(msg) if current_distance != distance: msg = ( @@ -446,33 +489,59 @@ def _set_up_collection( ) raise ValueError(msg) - def _recreate_collection(self, collection_name: str, distance, embedding_dim: int, on_disk: bool): # noqa: FBT001 - self.client.recreate_collection( - collection_name=collection_name, - vectors_config={ - DENSE_VECTORS_NAME: rest.VectorParams( + def _recreate_collection( + self, + collection_name: str, + distance, + embedding_dim: int, + on_disk: bool, # noqa: FBT001 + use_sparse_embeddings: bool, # noqa: FBT001 + ): + if use_sparse_embeddings: + self.client.recreate_collection( + collection_name=collection_name, + vectors_config={ + DENSE_VECTORS_NAME: rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + }, + sparse_vectors_config={ + SPARSE_VECTORS_NAME: rest.SparseVectorParams( + index=rest.SparseIndexParams( + on_disk=on_disk, + ) + ) + }, + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, + ) + if not use_sparse_embeddings: + self.client.recreate_collection( + collection_name=collection_name, + vectors_config=rest.VectorParams( size=embedding_dim, on_disk=on_disk, distance=distance, ), - }, - sparse_vectors_config={ - SPARSE_VECTORS_NAME: rest.SparseVectorParams( - index=rest.SparseIndexParams( - on_disk=on_disk, - ) - ) - }, - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, - ) + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, + ) def _handle_duplicate_documents( self, diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index c4de6d981..fd9d5f3ad 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -15,11 +15,12 @@ def haystack_to_qdrant() -> HaystackToQdrant: @pytest.fixture -def qdrant_to_haystack() -> QdrantToHaystack: +def qdrant_to_haystack(request) -> QdrantToHaystack: return QdrantToHaystack( content_field=CONTENT_FIELD, name_field=NAME_FIELD, embedding_field=EMBEDDING_FIELD, + use_sparse_embeddings=request.param, sparse_embedding_field=SPARSE_EMBEDDING_FIELD, ) @@ -30,7 +31,8 @@ def test_convert_id_is_deterministic(haystack_to_qdrant: HaystackToQdrant): assert first_id == second_id -def test_point_to_document_reverts_proper_structure_from_record( +@pytest.mark.parametrize("qdrant_to_haystack", [True], indirect=True) +def test_point_to_document_reverts_proper_structure_from_record_with_sparse( qdrant_to_haystack: QdrantToHaystack, ): point = rest.Record( @@ -56,3 +58,29 @@ def test_point_to_document_reverts_proper_structure_from_record( assert {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]} == document.sparse_embedding.to_dict() assert {"test_field": 1} == document.meta assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) + + +@pytest.mark.parametrize("qdrant_to_haystack", [False], indirect=True) +def test_point_to_document_reverts_proper_structure_from_record_without_sparse( + qdrant_to_haystack: QdrantToHaystack, +): + point = rest.Record( + id="c7c62e8e-02b9-4ec6-9f88-46bd97b628b7", + payload={ + "id": "my-id", + "id_hash_keys": ["content"], + "content": "Lorem ipsum", + "content_type": "text", + "meta": { + "test_field": 1, + }, + }, + vector=[1.0, 0.0, 0.0, 0.0], + ) + document = qdrant_to_haystack.point_to_document(point) + assert "my-id" == document.id + assert "Lorem ipsum" == document.content + assert "text" == document.content_type + assert document.sparse_embedding is None + assert {"test_field": 1} == document.meta + assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index 32fb750fb..fd8b15181 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -25,6 +25,7 @@ def test_to_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "use_sparse_embeddings": False, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -64,6 +65,7 @@ def test_from_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "use_sparse_embeddings": False, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -88,6 +90,7 @@ def test_from_dict(): document_store.content_field == "content", document_store.name_field == "name", document_store.embedding_field == "embedding", + document_store.use_sparse_embeddings is False, document_store.sparse_embedding_field == "sparse_embedding", document_store.on_disk is False, document_store.similarity == "cosine", diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 6b24270b4..b6369773c 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -49,6 +49,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "use_sparse_embeddings": False, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -121,7 +122,7 @@ def test_run(self, filterable_docs: List[Document]): class TestQdrantSparseRetriever(FilterableDocsFixtureMixin): def test_init_default(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True) retriever = QdrantSparseRetriever(document_store=document_store) assert retriever._document_store == document_store assert retriever._filters is None @@ -129,7 +130,7 @@ def test_init_default(self): assert retriever._return_embedding is False def test_to_dict(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True) retriever = QdrantSparseRetriever(document_store=document_store) res = retriever.to_dict() assert res == { @@ -155,6 +156,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", + "use_sparse_embeddings": True, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -218,18 +220,15 @@ def _generate_mocked_sparse_embedding(self, n): return list_of_sparse_vectors def test_run(self, filterable_docs: List[Document]): - document_store = QdrantDocumentStore(location=":memory:", index="Boi") + document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True) # Add fake sparse embedding to documents for doc in filterable_docs: doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0]) - document_store.write_documents(filterable_docs) - retriever = QdrantSparseRetriever(document_store=document_store) sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) results: List[Document] = retriever.run(query_sparse_embedding=sparse_embedding) - assert len(results["documents"]) == 10 # type: ignore results = retriever.run(query_sparse_embedding=sparse_embedding, top_k=5, return_embedding=False) From fd064da77c210e011b2be9a017c70e2dc588ee3f Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 17:19:49 +0100 Subject: [PATCH 25/41] feat(Qdrant): `use_sparse_embeddings` true by default + bugfix --- .../document_stores/qdrant/document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 0cffcf31b..277f3ba4d 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -71,7 +71,7 @@ def __init__( content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", - use_sparse_embeddings: bool = False, # noqa: FBT001, FBT002 + use_sparse_embeddings: bool = True, # noqa: FBT001, FBT002 sparse_embedding_field: str = "sparse_embedding", similarity: str = "cosine", return_embedding: bool = False, # noqa: FBT001, FBT002 @@ -139,6 +139,7 @@ def __init__( self.wait_result_from_api = wait_result_from_api self.recreate_index = recreate_index self.payload_fields_to_index = payload_fields_to_index + self.use_sparse_embeddings = use_sparse_embeddings # Make sure the collection is properly set up self._set_up_collection( @@ -150,7 +151,6 @@ def __init__( self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field - self.use_sparse_embeddings = use_sparse_embeddings self.sparse_embedding_field = sparse_embedding_field self.similarity = similarity self.index = index @@ -457,7 +457,7 @@ def _set_up_collection( if self.use_sparse_embeddings and not isinstance(collection_info.config.params.vectors, dict): msg = ( f"Collection '{collection_name}' already exists in Qdrant, " - f"but it has been originaly created without sparse embedding vectors." + f"but it has been originally created without sparse embedding vectors." f"If you want to use that collection, either set `use_sparse_embeddings=False` " f"or run a migration script " f"to use Named Dense Vectors (`text-sparse`) and Named Sparse Vectors (`text-dense`)." From b018504de8664393bf5b9d916cebe00d691c7003 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 17:24:26 +0100 Subject: [PATCH 26/41] feat(Qdrant): `use_sparse_embeddings` true by default + bugfix --- .../qdrant/tests/test_dict_converters.py | 6 +- integrations/qdrant/tests/test_retriever.py | 133 +++++++++++++++++- 2 files changed, 131 insertions(+), 8 deletions(-) diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index fd8b15181..c6cc5754c 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -25,7 +25,7 @@ def test_to_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", - "use_sparse_embeddings": False, + "use_sparse_embeddings": True, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -65,7 +65,7 @@ def test_from_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", - "use_sparse_embeddings": False, + "use_sparse_embeddings": True, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, @@ -90,7 +90,7 @@ def test_from_dict(): document_store.content_field == "content", document_store.name_field == "name", document_store.embedding_field == "embedding", - document_store.use_sparse_embeddings is False, + document_store.use_sparse_embeddings is True, document_store.sparse_embedding_field == "sparse_embedding", document_store.on_disk is False, document_store.similarity == "cosine", diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index b6369773c..961816ea6 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -15,7 +15,7 @@ class TestQdrantRetriever(FilterableDocsFixtureMixin): def test_init_default(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) retriever = QdrantEmbeddingRetriever(document_store=document_store) assert retriever._document_store == document_store assert retriever._filters is None @@ -23,7 +23,113 @@ def test_init_default(self): assert retriever._return_embedding is False def test_to_dict(self): - document_store = QdrantDocumentStore(location=":memory:", index="test") + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) + retriever = QdrantEmbeddingRetriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantEmbeddingRetriever", + "init_parameters": { + "document_store": { + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + "init_parameters": { + "location": ":memory:", + "url": None, + "port": 6333, + "grpc_port": 6334, + "prefer_grpc": False, + "https": None, + "api_key": None, + "prefix": None, + "timeout": None, + "host": None, + "path": None, + "index": "test", + "embedding_dim": 768, + "on_disk": False, + "content_field": "content", + "name_field": "name", + "embedding_field": "embedding", + "use_sparse_embeddings": True, + "sparse_embedding_field": "sparse_embedding", + "similarity": "cosine", + "return_embedding": False, + "progress_bar": True, + "duplicate_documents": "overwrite", + "recreate_index": False, + "shard_number": None, + "replication_factor": None, + "write_consistency_factor": None, + "on_disk_payload": None, + "hnsw_config": None, + "optimizers_config": None, + "wal_config": None, + "quantization_config": None, + "init_from": None, + "wait_result_from_api": True, + "metadata": {}, + "write_batch_size": 100, + "scroll_size": 10000, + "payload_fields_to_index": None, + }, + }, + "filters": None, + "top_k": 10, + "scale_score": True, + "return_embedding": False, + }, + } + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantEmbeddingRetriever", + "init_parameters": { + "document_store": { + "init_parameters": {"location": ":memory:", "index": "test"}, + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + }, + "filters": None, + "top_k": 5, + "scale_score": False, + "return_embedding": True, + }, + } + retriever = QdrantEmbeddingRetriever.from_dict(data) + assert isinstance(retriever._document_store, QdrantDocumentStore) + assert retriever._document_store.index == "test" + assert retriever._filters is None + assert retriever._top_k == 5 + assert retriever._scale_score is False + assert retriever._return_embedding is True + + def test_run(self, filterable_docs: List[Document]): + document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False) + + document_store.write_documents(filterable_docs) + + retriever = QdrantEmbeddingRetriever(document_store=document_store) + + results: List[Document] = retriever.run(query_embedding=_random_embeddings(768)) + + assert len(results["documents"]) == 10 # type: ignore + + results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False) + + assert len(results["documents"]) == 5 # type: ignore + + for document in results["documents"]: # type: ignore + assert document.embedding is None + +class TestQdrantRetriever(FilterableDocsFixtureMixin): + def test_init_default(self): + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) + retriever = QdrantEmbeddingRetriever(document_store=document_store) + assert retriever._document_store == document_store + assert retriever._filters is None + assert retriever._top_k == 10 + assert retriever._return_embedding is False + + def test_to_dict(self): + document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) retriever = QdrantEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { @@ -102,7 +208,7 @@ def test_from_dict(self): assert retriever._return_embedding is True def test_run(self, filterable_docs: List[Document]): - document_store = QdrantDocumentStore(location=":memory:", index="Boi") + document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False) document_store.write_documents(filterable_docs) @@ -119,10 +225,27 @@ def test_run(self, filterable_docs: List[Document]): for document in results["documents"]: # type: ignore assert document.embedding is None + def test_run_with_sparse_activated(self, filterable_docs: List[Document]): + document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=True) + + document_store.write_documents(filterable_docs) + + retriever = QdrantEmbeddingRetriever(document_store=document_store) + + results: List[Document] = retriever.run(query_embedding=_random_embeddings(768)) + + assert len(results["documents"]) == 10 # type: ignore + + results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False) + + assert len(results["documents"]) == 5 # type: ignore + + for document in results["documents"]: # type: ignore + assert document.embedding is None class TestQdrantSparseRetriever(FilterableDocsFixtureMixin): def test_init_default(self): - document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True) + document_store = QdrantDocumentStore(location=":memory:", index="test") retriever = QdrantSparseRetriever(document_store=document_store) assert retriever._document_store == document_store assert retriever._filters is None @@ -130,7 +253,7 @@ def test_init_default(self): assert retriever._return_embedding is False def test_to_dict(self): - document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=True) + document_store = QdrantDocumentStore(location=":memory:", index="test") retriever = QdrantSparseRetriever(document_store=document_store) res = retriever.to_dict() assert res == { From e1c38be921ae49701d526969f76ef2373a0fa0c4 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 17:26:04 +0100 Subject: [PATCH 27/41] feat(Qdrant): `use_sparse_embeddings` true by default + bugfix --- integrations/qdrant/tests/test_retriever.py | 107 +------------------- 1 file changed, 1 insertion(+), 106 deletions(-) diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 961816ea6..e58496868 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -13,112 +13,6 @@ from haystack_integrations.document_stores.qdrant import QdrantDocumentStore -class TestQdrantRetriever(FilterableDocsFixtureMixin): - def test_init_default(self): - document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) - retriever = QdrantEmbeddingRetriever(document_store=document_store) - assert retriever._document_store == document_store - assert retriever._filters is None - assert retriever._top_k == 10 - assert retriever._return_embedding is False - - def test_to_dict(self): - document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) - retriever = QdrantEmbeddingRetriever(document_store=document_store) - res = retriever.to_dict() - assert res == { - "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantEmbeddingRetriever", - "init_parameters": { - "document_store": { - "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", - "init_parameters": { - "location": ":memory:", - "url": None, - "port": 6333, - "grpc_port": 6334, - "prefer_grpc": False, - "https": None, - "api_key": None, - "prefix": None, - "timeout": None, - "host": None, - "path": None, - "index": "test", - "embedding_dim": 768, - "on_disk": False, - "content_field": "content", - "name_field": "name", - "embedding_field": "embedding", - "use_sparse_embeddings": True, - "sparse_embedding_field": "sparse_embedding", - "similarity": "cosine", - "return_embedding": False, - "progress_bar": True, - "duplicate_documents": "overwrite", - "recreate_index": False, - "shard_number": None, - "replication_factor": None, - "write_consistency_factor": None, - "on_disk_payload": None, - "hnsw_config": None, - "optimizers_config": None, - "wal_config": None, - "quantization_config": None, - "init_from": None, - "wait_result_from_api": True, - "metadata": {}, - "write_batch_size": 100, - "scroll_size": 10000, - "payload_fields_to_index": None, - }, - }, - "filters": None, - "top_k": 10, - "scale_score": True, - "return_embedding": False, - }, - } - - def test_from_dict(self): - data = { - "type": "haystack_integrations.components.retrievers.qdrant.retriever.QdrantEmbeddingRetriever", - "init_parameters": { - "document_store": { - "init_parameters": {"location": ":memory:", "index": "test"}, - "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", - }, - "filters": None, - "top_k": 5, - "scale_score": False, - "return_embedding": True, - }, - } - retriever = QdrantEmbeddingRetriever.from_dict(data) - assert isinstance(retriever._document_store, QdrantDocumentStore) - assert retriever._document_store.index == "test" - assert retriever._filters is None - assert retriever._top_k == 5 - assert retriever._scale_score is False - assert retriever._return_embedding is True - - def test_run(self, filterable_docs: List[Document]): - document_store = QdrantDocumentStore(location=":memory:", index="Boi", use_sparse_embeddings=False) - - document_store.write_documents(filterable_docs) - - retriever = QdrantEmbeddingRetriever(document_store=document_store) - - results: List[Document] = retriever.run(query_embedding=_random_embeddings(768)) - - assert len(results["documents"]) == 10 # type: ignore - - results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False) - - assert len(results["documents"]) == 5 # type: ignore - - for document in results["documents"]: # type: ignore - assert document.embedding is None - class TestQdrantRetriever(FilterableDocsFixtureMixin): def test_init_default(self): document_store = QdrantDocumentStore(location=":memory:", index="test", use_sparse_embeddings=False) @@ -243,6 +137,7 @@ def test_run_with_sparse_activated(self, filterable_docs: List[Document]): for document in results["documents"]: # type: ignore assert document.embedding is None + class TestQdrantSparseRetriever(FilterableDocsFixtureMixin): def test_init_default(self): document_store = QdrantDocumentStore(location=":memory:", index="test") From 1cb601d81d6478118715b08f86074f7428f812e1 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Fri, 22 Mar 2024 17:35:33 +0100 Subject: [PATCH 28/41] feat(Qdrant): bugfix --- .../document_stores/qdrant/document_store.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 277f3ba4d..a6a1e377e 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -447,12 +447,6 @@ def _set_up_collection( # Create Payload index if payload_fields_to_index is provided self._create_payload_index(collection_name, payload_fields_to_index) return - if self.use_sparse_embeddings: - current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance - current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size - if not self.use_sparse_embeddings: - current_distance = collection_info.config.params.vectors.distance - current_vector_size = collection_info.config.params.vectors.size if self.use_sparse_embeddings and not isinstance(collection_info.config.params.vectors, dict): msg = ( @@ -471,6 +465,13 @@ def _set_up_collection( ) raise ValueError(msg) + if self.use_sparse_embeddings: + current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance + current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size + if not self.use_sparse_embeddings: + current_distance = collection_info.config.params.vectors.distance + current_vector_size = collection_info.config.params.vectors.size + if current_distance != distance: msg = ( f"Collection '{collection_name}' already exists in Qdrant, " From a3bd3d3f919affd3b0f3022297caeab9989460b2 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 09:36:05 +0100 Subject: [PATCH 29/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush --- .../haystack_integrations/document_stores/qdrant/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index f5d220365..abf55db8f 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -39,7 +39,7 @@ def documents_to_batch( sparse_vector_instance = rest.SparseVector(**sparse_vector) vector[SPARSE_VECTORS_NAME] = sparse_vector_instance - if not use_sparse_embeddings: + else: vector = payload.pop(embedding_field) or {} _id = self.convert_id(payload.get("id")) From a72b65bbfe1f4a87a6ee380ef8506d1fddac3707 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 09:36:13 +0100 Subject: [PATCH 30/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush --- .../haystack_integrations/document_stores/qdrant/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index abf55db8f..2f103a6e1 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -85,7 +85,7 @@ def point_to_document(self, point: QdrantPoint) -> Document: if not self.use_sparse_embeddings: payload["embedding"] = point.vector if hasattr(point, "vector") else None - if self.use_sparse_embeddings: + else: if hasattr(point, "vector") and point.vector is not None and DENSE_VECTORS_NAME in point.vector: payload["embedding"] = point.vector[DENSE_VECTORS_NAME] else: From 827b8267248efea9ede9b744138c965a55f53c66 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 09:36:20 +0100 Subject: [PATCH 31/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush --- .../document_stores/qdrant/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index a6a1e377e..3c2c0a4a3 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -468,7 +468,7 @@ def _set_up_collection( if self.use_sparse_embeddings: current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size - if not self.use_sparse_embeddings: + else: current_distance = collection_info.config.params.vectors.distance current_vector_size = collection_info.config.params.vectors.size From e66c74b17c1079cf279c3e56a4a56284f7381a4f Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 09:36:31 +0100 Subject: [PATCH 32/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush --- .../document_stores/qdrant/document_store.py | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 3c2c0a4a3..1f67249c7 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -358,25 +358,16 @@ def query_by_embedding( ) -> List[Document]: qdrant_filters = self.qdrant_filter_converter.convert(filters) - if self.use_sparse_embeddings: - points = self.client.search( - collection_name=self.index, - query_vector=rest.NamedVector( - name=DENSE_VECTORS_NAME, - vector=query_embedding, - ), - query_filter=qdrant_filters, - limit=top_k, - with_vectors=return_embedding, - ) - if not self.use_sparse_embeddings: - points = self.client.search( - collection_name=self.index, - query_vector=query_embedding, - query_filter=qdrant_filters, - limit=top_k, - with_vectors=return_embedding, - ) + points = self.client.search( + collection_name=self.index, + query_vector=rest.NamedVector( + name=DENSE_VECTORS_NAME if self.use_sparse_embeddings else "", + vector=query_embedding, + ), + query_filter=qdrant_filters, + limit=top_k, + with_vectors=return_embedding, + ) results = [self.qdrant_to_haystack.point_to_document(point) for point in points] if scale_score: for document in results: From f7cf65ec7ea9b2bf2a360c41096cc5770b114f82 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 09:37:11 +0100 Subject: [PATCH 33/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush --- .../document_stores/qdrant/document_store.py | 69 ++++++++----------- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 1f67249c7..3722bb535 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -489,52 +489,37 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): + dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" + + vectors_config = { + dense_vector_name: rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + } + if use_sparse_embeddings: - self.client.recreate_collection( - collection_name=collection_name, - vectors_config={ - DENSE_VECTORS_NAME: rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, - ), - }, - sparse_vectors_config={ - SPARSE_VECTORS_NAME: rest.SparseVectorParams( - index=rest.SparseIndexParams( - on_disk=on_disk, - ) - ) - }, - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, - ) - if not use_sparse_embeddings: - self.client.recreate_collection( - collection_name=collection_name, - vectors_config=rest.VectorParams( - size=embedding_dim, + vectors_config[SPARSE_VECTORS_NAME] = rest.SparseVectorParams( + index=rest.SparseIndexParams( on_disk=on_disk, - distance=distance, - ), - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, + ) ) + self.client.recreate_collection( + collection_name=collection_name, + vectors_config=vectors_config, + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, + ) + def _handle_duplicate_documents( self, documents: List[Document], From a968d712f1e753ab136907153f3082b7f97b3d5b Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 25 Mar 2024 09:44:59 +0100 Subject: [PATCH 34/41] Revert "Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py" This reverts commit f7cf65ec7ea9b2bf2a360c41096cc5770b114f82. --- .../document_stores/qdrant/document_store.py | 69 +++++++++++-------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 3722bb535..1f67249c7 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -489,37 +489,52 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): - dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" - - vectors_config = { - dense_vector_name: rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, - ), - } - if use_sparse_embeddings: - vectors_config[SPARSE_VECTORS_NAME] = rest.SparseVectorParams( - index=rest.SparseIndexParams( + self.client.recreate_collection( + collection_name=collection_name, + vectors_config={ + DENSE_VECTORS_NAME: rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + }, + sparse_vectors_config={ + SPARSE_VECTORS_NAME: rest.SparseVectorParams( + index=rest.SparseIndexParams( + on_disk=on_disk, + ) + ) + }, + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, + ) + if not use_sparse_embeddings: + self.client.recreate_collection( + collection_name=collection_name, + vectors_config=rest.VectorParams( + size=embedding_dim, on_disk=on_disk, - ) + distance=distance, + ), + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, ) - self.client.recreate_collection( - collection_name=collection_name, - vectors_config=vectors_config, - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, - ) - def _handle_duplicate_documents( self, documents: List[Document], From 56c4ee51f58648157f2e0d038d335777ad05b27f Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 25 Mar 2024 10:26:02 +0100 Subject: [PATCH 35/41] feat(Qdrant): fixing test --- .../document_stores/qdrant/document_store.py | 84 ++++++++----------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 1f67249c7..951cf52db 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -316,7 +316,6 @@ def query_by_sparse( scale_score: bool = True, # noqa: FBT001, FBT002 return_embedding: bool = False, # noqa: FBT001, FBT002 ) -> List[Document]: - if not self.use_sparse_embeddings: message = ( "Error: tried to query by sparse vector with a Qdrant " @@ -414,6 +413,7 @@ def _set_up_collection( payload_fields_to_index: Optional[List[dict]] = None, ): distance = self._get_distance(similarity) + dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" if recreate_collection: # There is no need to verify the current configuration of that @@ -439,7 +439,7 @@ def _set_up_collection( self._create_payload_index(collection_name, payload_fields_to_index) return - if self.use_sparse_embeddings and not isinstance(collection_info.config.params.vectors, dict): + if self.use_sparse_embeddings and DENSE_VECTORS_NAME not in collection_info.config.params.vectors.keys(): msg = ( f"Collection '{collection_name}' already exists in Qdrant, " f"but it has been originally created without sparse embedding vectors." @@ -448,7 +448,7 @@ def _set_up_collection( f"to use Named Dense Vectors (`text-sparse`) and Named Sparse Vectors (`text-dense`)." ) raise ValueError(msg) - if not self.use_sparse_embeddings and isinstance(collection_info.config.params.vectors, dict): + if not self.use_sparse_embeddings and DENSE_VECTORS_NAME in collection_info.config.params.vectors.keys(): msg = ( f"Collection '{collection_name}' already exists in Qdrant, " f"but it has been originaly created with sparse embedding vectors." @@ -456,12 +456,8 @@ def _set_up_collection( ) raise ValueError(msg) - if self.use_sparse_embeddings: - current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance - current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size - else: - current_distance = collection_info.config.params.vectors.distance - current_vector_size = collection_info.config.params.vectors.size + current_distance = collection_info.config.params.vectors[dense_vector_name].distance + current_vector_size = collection_info.config.params.vectors[dense_vector_name].size if current_distance != distance: msg = ( @@ -489,51 +485,39 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): + dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" + + vectors_config = { + dense_vector_name: rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + } + if use_sparse_embeddings: - self.client.recreate_collection( - collection_name=collection_name, - vectors_config={ - DENSE_VECTORS_NAME: rest.VectorParams( - size=embedding_dim, + sparse_vectors_config = { + SPARSE_VECTORS_NAME: rest.SparseVectorParams( + index=rest.SparseIndexParams( on_disk=on_disk, - distance=distance, - ), - }, - sparse_vectors_config={ - SPARSE_VECTORS_NAME: rest.SparseVectorParams( - index=rest.SparseIndexParams( - on_disk=on_disk, - ) ) - }, - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, - ) - if not use_sparse_embeddings: - self.client.recreate_collection( - collection_name=collection_name, - vectors_config=rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, ), - shard_number=self.shard_number, - replication_factor=self.replication_factor, - write_consistency_factor=self.write_consistency_factor, - on_disk_payload=self.on_disk_payload, - hnsw_config=self.hnsw_config, - optimizers_config=self.optimizers_config, - wal_config=self.wal_config, - quantization_config=self.quantization_config, - init_from=self.init_from, - ) + } + + self.client.recreate_collection( + collection_name=collection_name, + vectors_config=vectors_config, + sparse_vectors_config=sparse_vectors_config if use_sparse_embeddings else None, + shard_number=self.shard_number, + replication_factor=self.replication_factor, + write_consistency_factor=self.write_consistency_factor, + on_disk_payload=self.on_disk_payload, + hnsw_config=self.hnsw_config, + optimizers_config=self.optimizers_config, + wal_config=self.wal_config, + quantization_config=self.quantization_config, + init_from=self.init_from, + ) def _handle_duplicate_documents( self, From 99749754b3e806d9bdfffc5c4de4146f244d2221 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 25 Mar 2024 10:27:44 +0100 Subject: [PATCH 36/41] Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush --- .../document_stores/qdrant/converters.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 2f103a6e1..c73ede0de 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -86,10 +86,8 @@ def point_to_document(self, point: QdrantPoint) -> Document: payload["embedding"] = point.vector if hasattr(point, "vector") else None else: - if hasattr(point, "vector") and point.vector is not None and DENSE_VECTORS_NAME in point.vector: - payload["embedding"] = point.vector[DENSE_VECTORS_NAME] - else: - payload["embedding"] = None + if hasattr(point, "vector") and point.vector is not None: + payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME) if hasattr(point, "vector") and point.vector is not None and SPARSE_VECTORS_NAME in point.vector: parse_vector_dict = { From 67ad96ce0171bef21678792649230e8eec4e9b76 Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 25 Mar 2024 11:20:52 +0100 Subject: [PATCH 37/41] feat(Qdrant): fixing creation --- .../document_stores/qdrant/document_store.py | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 951cf52db..47e54d6e2 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -413,7 +413,6 @@ def _set_up_collection( payload_fields_to_index: Optional[List[dict]] = None, ): distance = self._get_distance(similarity) - dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" if recreate_collection: # There is no need to verify the current configuration of that @@ -439,7 +438,12 @@ def _set_up_collection( self._create_payload_index(collection_name, payload_fields_to_index) return - if self.use_sparse_embeddings and DENSE_VECTORS_NAME not in collection_info.config.params.vectors.keys(): + if isinstance(collection_info.config.params.vectors, dict): + has_named_vectors = DENSE_VECTORS_NAME in collection_info.config.params.vectors + else: + has_named_vectors = False + + if self.use_sparse_embeddings and not has_named_vectors: msg = ( f"Collection '{collection_name}' already exists in Qdrant, " f"but it has been originally created without sparse embedding vectors." @@ -448,16 +452,21 @@ def _set_up_collection( f"to use Named Dense Vectors (`text-sparse`) and Named Sparse Vectors (`text-dense`)." ) raise ValueError(msg) - if not self.use_sparse_embeddings and DENSE_VECTORS_NAME in collection_info.config.params.vectors.keys(): + + elif not self.use_sparse_embeddings and has_named_vectors: msg = ( f"Collection '{collection_name}' already exists in Qdrant, " - f"but it has been originaly created with sparse embedding vectors." + f"but it has been originally created with sparse embedding vectors." f"If you want to use that collection, please set `use_sparse_embeddings=True`" ) raise ValueError(msg) - current_distance = collection_info.config.params.vectors[dense_vector_name].distance - current_vector_size = collection_info.config.params.vectors[dense_vector_name].size + if self.use_sparse_embeddings: + current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance + current_vector_size = collection_info.config.params.vectors[DENSE_VECTORS_NAME].size + else: + current_distance = collection_info.config.params.vectors.distance + current_vector_size = collection_info.config.params.vectors.size if current_distance != distance: msg = ( @@ -485,17 +494,16 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): - dense_vector_name = DENSE_VECTORS_NAME if use_sparse_embeddings else "" - - vectors_config = { - dense_vector_name: rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, - ), - } if use_sparse_embeddings: + vectors_config = { + DENSE_VECTORS_NAME: rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance, + ), + } + sparse_vectors_config = { SPARSE_VECTORS_NAME: rest.SparseVectorParams( index=rest.SparseIndexParams( @@ -503,6 +511,11 @@ def _recreate_collection( ) ), } + else: + vectors_config = rest.VectorParams( + size=embedding_dim, + on_disk=on_disk, + distance=distance) self.client.recreate_collection( collection_name=collection_name, From 2f103a7696744892a4f8b919a2a04ff5295d832a Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Mon, 25 Mar 2024 11:22:04 +0100 Subject: [PATCH 38/41] feat(Qdrant): fixing creation --- .../document_stores/qdrant/document_store.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 47e54d6e2..cb0471b72 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -494,7 +494,6 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): - if use_sparse_embeddings: vectors_config = { DENSE_VECTORS_NAME: rest.VectorParams( @@ -512,10 +511,7 @@ def _recreate_collection( ), } else: - vectors_config = rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance) + vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance) self.client.recreate_collection( collection_name=collection_name, From 976cbb5bd80d18c1f7a7f0ca45eeef3b51ddc478 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 27 Mar 2024 19:42:28 +0100 Subject: [PATCH 39/41] little fixes --- .../document_stores/qdrant/converters.py | 4 +- .../document_stores/qdrant/document_store.py | 39 ++++++++----------- integrations/qdrant/tests/test_retriever.py | 35 ++++++++--------- 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index c73ede0de..4bc7443ce 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -84,7 +84,6 @@ def point_to_document(self, point: QdrantPoint) -> Document: payload["score"] = point.score if hasattr(point, "score") else None if not self.use_sparse_embeddings: payload["embedding"] = point.vector if hasattr(point, "vector") else None - else: if hasattr(point, "vector") and point.vector is not None: payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME) @@ -95,6 +94,5 @@ def point_to_document(self, point: QdrantPoint) -> Document: "values": point.vector[SPARSE_VECTORS_NAME].values, } payload["sparse_embedding"] = parse_vector_dict - else: - payload["sparse_embedding"] = None + return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index cb0471b72..ca7220125 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -318,10 +318,10 @@ def query_by_sparse( ) -> List[Document]: if not self.use_sparse_embeddings: message = ( - "Error: tried to query by sparse vector with a Qdrant " - "Document Store initialized with use_sparse_embeddings=False" + "You are trying to query using sparse embeddings, but the Document Store " + "was initialized with `use_sparse_embeddings=False`. " ) - raise ValueError(message) + raise QdrantStoreError(message) qdrant_filters = self.qdrant_filter_converter.convert(filters) query_indices = query_sparse_embedding.indices @@ -438,28 +438,27 @@ def _set_up_collection( self._create_payload_index(collection_name, payload_fields_to_index) return - if isinstance(collection_info.config.params.vectors, dict): - has_named_vectors = DENSE_VECTORS_NAME in collection_info.config.params.vectors - else: - has_named_vectors = False + has_named_vectors = ( + isinstance(collection_info.config.params.vectors, dict) + and DENSE_VECTORS_NAME in collection_info.config.params.vectors + ) if self.use_sparse_embeddings and not has_named_vectors: msg = ( f"Collection '{collection_name}' already exists in Qdrant, " - f"but it has been originally created without sparse embedding vectors." - f"If you want to use that collection, either set `use_sparse_embeddings=False` " - f"or run a migration script " - f"to use Named Dense Vectors (`text-sparse`) and Named Sparse Vectors (`text-dense`)." + f"but it has been originally created without sparse embedding vectors. " + f"If you want to use that collection, you can set `use_sparse_embeddings=False`. " + f"To use sparse embeddings, you need to recreate the collection or migrate the existing one." ) - raise ValueError(msg) + raise QdrantStoreError(msg) elif not self.use_sparse_embeddings and has_named_vectors: msg = ( f"Collection '{collection_name}' already exists in Qdrant, " f"but it has been originally created with sparse embedding vectors." - f"If you want to use that collection, please set `use_sparse_embeddings=True`" + f"If you want to use that collection, please set `use_sparse_embeddings=True`." ) - raise ValueError(msg) + raise QdrantStoreError(msg) if self.use_sparse_embeddings: current_distance = collection_info.config.params.vectors[DENSE_VECTORS_NAME].distance @@ -494,14 +493,10 @@ def _recreate_collection( on_disk: bool, # noqa: FBT001 use_sparse_embeddings: bool, # noqa: FBT001 ): + dense_vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance) + if use_sparse_embeddings: - vectors_config = { - DENSE_VECTORS_NAME: rest.VectorParams( - size=embedding_dim, - on_disk=on_disk, - distance=distance, - ), - } + vectors_config = {DENSE_VECTORS_NAME: dense_vectors_config} sparse_vectors_config = { SPARSE_VECTORS_NAME: rest.SparseVectorParams( @@ -511,7 +506,7 @@ def _recreate_collection( ), } else: - vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance) + vectors_config = dense_vectors_config self.client.recreate_collection( collection_name=collection_name, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index e58496868..ae3475d02 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -108,15 +108,13 @@ def test_run(self, filterable_docs: List[Document]): retriever = QdrantEmbeddingRetriever(document_store=document_store) - results: List[Document] = retriever.run(query_embedding=_random_embeddings(768)) + results: List[Document] = retriever.run(query_embedding=_random_embeddings(768))["documents"] + assert len(results) == 10 - assert len(results["documents"]) == 10 # type: ignore + results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False)["documents"] + assert len(results) == 5 - results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False) - - assert len(results["documents"]) == 5 # type: ignore - - for document in results["documents"]: # type: ignore + for document in results: assert document.embedding is None def test_run_with_sparse_activated(self, filterable_docs: List[Document]): @@ -126,15 +124,15 @@ def test_run_with_sparse_activated(self, filterable_docs: List[Document]): retriever = QdrantEmbeddingRetriever(document_store=document_store) - results: List[Document] = retriever.run(query_embedding=_random_embeddings(768)) + results: List[Document] = retriever.run(query_embedding=_random_embeddings(768))["documents"] - assert len(results["documents"]) == 10 # type: ignore + assert len(results) == 10 - results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False) + results = retriever.run(query_embedding=_random_embeddings(768), top_k=5, return_embedding=False)["documents"] - assert len(results["documents"]) == 5 # type: ignore + assert len(results) == 5 - for document in results["documents"]: # type: ignore + for document in results: assert document.embedding is None @@ -243,15 +241,16 @@ def test_run(self, filterable_docs: List[Document]): # Add fake sparse embedding to documents for doc in filterable_docs: doc.sparse_embedding = SparseEmbedding.from_dict(self._generate_mocked_sparse_embedding(1)[0]) + document_store.write_documents(filterable_docs) retriever = QdrantSparseRetriever(document_store=document_store) sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) - results: List[Document] = retriever.run(query_sparse_embedding=sparse_embedding) - assert len(results["documents"]) == 10 # type: ignore - results = retriever.run(query_sparse_embedding=sparse_embedding, top_k=5, return_embedding=False) + results: List[Document] = retriever.run(query_sparse_embedding=sparse_embedding)["documents"] + assert len(results) == 10 - assert len(results["documents"]) == 5 # type: ignore + results = retriever.run(query_sparse_embedding=sparse_embedding, top_k=5, return_embedding=True)["documents"] + assert len(results) == 5 - for document in results["documents"]: # type: ignore - assert document.embedding is None + for document in results: + assert document.sparse_embedding From ee819c8a8dfce3796075720bc81a1ce1dbaaf0a9 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 11 Apr 2024 10:43:09 +0200 Subject: [PATCH 40/41] make changes nonbreaking --- .github/workflows/qdrant.yml | 4 +--- integrations/qdrant/pyproject.toml | 2 +- .../document_stores/qdrant/document_store.py | 2 +- integrations/qdrant/tests/test_dict_converters.py | 2 +- integrations/qdrant/tests/test_retriever.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/qdrant.yml b/.github/workflows/qdrant.yml index a629dbb7f..5995911fb 100644 --- a/.github/workflows/qdrant.yml +++ b/.github/workflows/qdrant.yml @@ -58,9 +58,7 @@ jobs: - name: Run tests id: tests - run: | - hatch run pip install git+https://github.com/deepset-ai/haystack.git #TODO: rm before merging - hatch run cov + run: hatch run cov - name: Nightly - run unit tests with Haystack main branch if: github.event_name == 'schedule' diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml index 29be8da0f..fb969fb4e 100644 --- a/integrations/qdrant/pyproject.toml +++ b/integrations/qdrant/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai", "qdrant-client"] +dependencies = ["haystack-ai>=2.0.1", "qdrant-client"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index ca7220125..a32c970ce 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -71,7 +71,7 @@ def __init__( content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", - use_sparse_embeddings: bool = True, # noqa: FBT001, FBT002 + use_sparse_embeddings: bool = False, # noqa: FBT001, FBT002 sparse_embedding_field: str = "sparse_embedding", similarity: str = "cosine", return_embedding: bool = False, # noqa: FBT001, FBT002 diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index c6cc5754c..7e94fa083 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -25,7 +25,7 @@ def test_to_dict(): "content_field": "content", "name_field": "name", "embedding_field": "embedding", - "use_sparse_embeddings": True, + "use_sparse_embeddings": False, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index ae3475d02..0f9452143 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -172,7 +172,7 @@ def test_to_dict(self): "content_field": "content", "name_field": "name", "embedding_field": "embedding", - "use_sparse_embeddings": True, + "use_sparse_embeddings": False, "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, From 804afd7db9242410c822b28fb8c5dfb1061b486d Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 12 Apr 2024 12:56:50 +0200 Subject: [PATCH 41/41] refactoring --- integrations/qdrant/pyproject.toml | 2 + .../components/retrievers/qdrant/retriever.py | 10 +- .../document_stores/qdrant/converters.py | 138 +++---- .../document_stores/qdrant/document_store.py | 95 ++--- .../document_stores/qdrant/filters.py | 366 +++++++++--------- integrations/qdrant/tests/test_converters.py | 49 +-- .../qdrant/tests/test_dict_converters.py | 3 - integrations/qdrant/tests/test_retriever.py | 2 - 8 files changed, 315 insertions(+), 350 deletions(-) diff --git a/integrations/qdrant/pyproject.toml b/integrations/qdrant/pyproject.toml index fb969fb4e..a566de955 100644 --- a/integrations/qdrant/pyproject.toml +++ b/integrations/qdrant/pyproject.toml @@ -103,6 +103,8 @@ ignore = [ "B027", # Allow boolean positional values in function calls, like `dict.get(... True)` "FBT003", + # Allow boolean arguments in function definition + "FBT001", "FBT002", # Ignore checks for possible passwords "S105", "S106", diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 12a67a3b7..0b7bfa1a4 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -33,8 +33,8 @@ def __init__( document_store: QdrantDocumentStore, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 + scale_score: bool = True, + return_embedding: bool = False, ): """ Create a QdrantEmbeddingRetriever component. @@ -137,7 +137,7 @@ class QdrantSparseRetriever: document_store = QdrantDocumentStore( ":memory:", recreate_index=True, - return_sparse_embedding=True, + return_embedding=True, wait_result_from_api=True, ) retriever = QdrantSparseRetriever(document_store=document_store) @@ -151,8 +151,8 @@ def __init__( document_store: QdrantDocumentStore, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 + scale_score: bool = True, + return_embedding: bool = False, ): """ Create a QdrantSparseRetriever component. diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py index 4bc7443ce..96bd4f37a 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py @@ -11,88 +11,70 @@ SPARSE_VECTORS_NAME = "text-sparse" -class HaystackToQdrant: - """A converter from Haystack to Qdrant types.""" - - UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d") - - def documents_to_batch( - self, - documents: List[Document], - *, - embedding_field: str, - use_sparse_embeddings: bool, - sparse_embedding_field: str, - ) -> List[rest.PointStruct]: - points = [] - for document in documents: - payload = document.to_dict(flatten=False) - if use_sparse_embeddings: - vector = {} - - dense_vector = payload.pop(embedding_field, None) - if dense_vector is not None: - vector[DENSE_VECTORS_NAME] = dense_vector - - sparse_vector = payload.pop(sparse_embedding_field, None) - if sparse_vector is not None: - sparse_vector_instance = rest.SparseVector(**sparse_vector) - vector[SPARSE_VECTORS_NAME] = sparse_vector_instance - - else: - vector = payload.pop(embedding_field) or {} - _id = self.convert_id(payload.get("id")) - - point = rest.PointStruct( - payload=payload, - vector=vector, - id=_id, - ) - points.append(point) - return points - - def convert_id(self, _id: str) -> str: - """ - Converts any string into a UUID-like format in a deterministic way. - - Qdrant does not accept any string as an id, so an internal id has to be - generated for each point. This is a deterministic way of doing so. - """ - return uuid.uuid5(self.UUID_NAMESPACE, _id).hex +UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d") + + +def convert_haystack_documents_to_qdrant_points( + documents: List[Document], + *, + embedding_field: str, + use_sparse_embeddings: bool, +) -> List[rest.PointStruct]: + points = [] + for document in documents: + payload = document.to_dict(flatten=False) + if use_sparse_embeddings: + vector = {} + + dense_vector = payload.pop(embedding_field, None) + if dense_vector is not None: + vector[DENSE_VECTORS_NAME] = dense_vector + + sparse_vector = payload.pop("sparse_embedding", None) + if sparse_vector is not None: + sparse_vector_instance = rest.SparseVector(**sparse_vector) + vector[SPARSE_VECTORS_NAME] = sparse_vector_instance + + else: + vector = payload.pop(embedding_field) or {} + _id = convert_id(payload.get("id")) + + point = rest.PointStruct( + payload=payload, + vector=vector, + id=_id, + ) + points.append(point) + return points + + +def convert_id(_id: str) -> str: + """ + Converts any string into a UUID-like format in a deterministic way. + + Qdrant does not accept any string as an id, so an internal id has to be + generated for each point. This is a deterministic way of doing so. + """ + return uuid.uuid5(UUID_NAMESPACE, _id).hex QdrantPoint = Union[rest.ScoredPoint, rest.Record] -class QdrantToHaystack: - def __init__( - self, - content_field: str, - name_field: str, - embedding_field: str, - use_sparse_embeddings: bool, # noqa: FBT001 - sparse_embedding_field: str, - ): - self.content_field = content_field - self.name_field = name_field - self.embedding_field = embedding_field - self.use_sparse_embeddings = use_sparse_embeddings - self.sparse_embedding_field = sparse_embedding_field - - def point_to_document(self, point: QdrantPoint) -> Document: - payload = {**point.payload} - payload["score"] = point.score if hasattr(point, "score") else None - if not self.use_sparse_embeddings: - payload["embedding"] = point.vector if hasattr(point, "vector") else None - else: - if hasattr(point, "vector") and point.vector is not None: - payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME) +def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document: + payload = {**point.payload} + payload["score"] = point.score if hasattr(point, "score") else None + + if not use_sparse_embeddings: + payload["embedding"] = point.vector if hasattr(point, "vector") else None + elif hasattr(point, "vector") and point.vector is not None: + payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME) - if hasattr(point, "vector") and point.vector is not None and SPARSE_VECTORS_NAME in point.vector: - parse_vector_dict = { - "indices": point.vector[SPARSE_VECTORS_NAME].indices, - "values": point.vector[SPARSE_VECTORS_NAME].values, - } - payload["sparse_embedding"] = parse_vector_dict + if SPARSE_VECTORS_NAME in point.vector: + parse_vector_dict = { + "indices": point.vector[SPARSE_VECTORS_NAME].indices, + "values": point.vector[SPARSE_VECTORS_NAME].values, + } + payload["sparse_embedding"] = parse_vector_dict - return Document.from_dict(payload) + return Document.from_dict(payload) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index a32c970ce..8771a3515 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -12,20 +12,23 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils import Secret, deserialize_secrets_inplace -from haystack.utils.filters import convert +from haystack.utils.filters import convert as convert_legacy_filters from qdrant_client import grpc from qdrant_client.http import models as rest from qdrant_client.http.exceptions import UnexpectedResponse from tqdm import tqdm -from .converters import HaystackToQdrant, QdrantToHaystack -from .filters import QdrantFilterConverter +from .converters import ( + DENSE_VECTORS_NAME, + SPARSE_VECTORS_NAME, + convert_haystack_documents_to_qdrant_points, + convert_id, + convert_qdrant_point_to_haystack_document, +) +from .filters import convert_filters_to_qdrant logger = logging.getLogger(__name__) -DENSE_VECTORS_NAME = "text-dense" -SPARSE_VECTORS_NAME = "text-sparse" - class QdrantStoreError(DocumentStoreError): pass @@ -58,7 +61,7 @@ def __init__( url: Optional[str] = None, port: int = 6333, grpc_port: int = 6334, - prefer_grpc: bool = False, # noqa: FBT001, FBT002 + prefer_grpc: bool = False, https: Optional[bool] = None, api_key: Optional[Secret] = None, prefix: Optional[str] = None, @@ -67,17 +70,16 @@ def __init__( path: Optional[str] = None, index: str = "Document", embedding_dim: int = 768, - on_disk: bool = False, # noqa: FBT001, FBT002 + on_disk: bool = False, content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", - use_sparse_embeddings: bool = False, # noqa: FBT001, FBT002 - sparse_embedding_field: str = "sparse_embedding", + use_sparse_embeddings: bool = False, similarity: str = "cosine", - return_embedding: bool = False, # noqa: FBT001, FBT002 - progress_bar: bool = True, # noqa: FBT001, FBT002 + return_embedding: bool = False, + progress_bar: bool = True, duplicate_documents: str = "overwrite", - recreate_index: bool = False, # noqa: FBT001, FBT002 + recreate_index: bool = False, shard_number: Optional[int] = None, replication_factor: Optional[int] = None, write_consistency_factor: Optional[int] = None, @@ -87,7 +89,7 @@ def __init__( wal_config: Optional[dict] = None, quantization_config: Optional[dict] = None, init_from: Optional[dict] = None, - wait_result_from_api: bool = True, # noqa: FBT001, FBT002 + wait_result_from_api: bool = True, metadata: Optional[dict] = None, write_batch_size: int = 100, scroll_size: int = 10_000, @@ -151,17 +153,11 @@ def __init__( self.content_field = content_field self.name_field = name_field self.embedding_field = embedding_field - self.sparse_embedding_field = sparse_embedding_field self.similarity = similarity self.index = index self.return_embedding = return_embedding self.progress_bar = progress_bar self.duplicate_documents = duplicate_documents - self.qdrant_filter_converter = QdrantFilterConverter() - self.haystack_to_qdrant_converter = HaystackToQdrant() - self.qdrant_to_haystack = QdrantToHaystack( - content_field, name_field, embedding_field, use_sparse_embeddings, sparse_embedding_field - ) self.write_batch_size = write_batch_size self.scroll_size = scroll_size @@ -186,7 +182,7 @@ def filter_documents( raise ValueError(msg) if filters and "operator" not in filters: - filters = convert(filters) + filters = convert_legacy_filters(filters) return list( self.get_documents_generator( filters, @@ -217,11 +213,10 @@ def write_documents( batched_documents = get_batches_from_generator(document_objects, self.write_batch_size) with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar: for document_batch in batched_documents: - batch = self.haystack_to_qdrant_converter.documents_to_batch( + batch = convert_haystack_documents_to_qdrant_points( document_batch, embedding_field=self.embedding_field, use_sparse_embeddings=self.use_sparse_embeddings, - sparse_embedding_field=self.sparse_embedding_field, ) self.client.upsert( @@ -234,7 +229,7 @@ def write_documents( return len(document_objects) def delete_documents(self, ids: List[str]): - ids = [self.haystack_to_qdrant_converter.convert_id(_id) for _id in ids] + ids = [convert_id(_id) for _id in ids] try: self.client.delete( collection_name=self.index, @@ -267,7 +262,7 @@ def get_documents_generator( filters: Optional[Dict[str, Any]] = None, ) -> Generator[Document, None, None]: index = self.index - qdrant_filters = self.qdrant_filter_converter.convert(filters) + qdrant_filters = convert_filters_to_qdrant(filters) next_offset = None stop_scrolling = False @@ -285,7 +280,9 @@ def get_documents_generator( ) for record in records: - yield self.qdrant_to_haystack.point_to_document(record) + yield convert_qdrant_point_to_haystack_document( + record, use_sparse_embeddings=self.use_sparse_embeddings + ) def get_documents_by_id( self, @@ -296,7 +293,7 @@ def get_documents_by_id( documents: List[Document] = [] - ids = [self.haystack_to_qdrant_converter.convert_id(_id) for _id in ids] + ids = [convert_id(_id) for _id in ids] records = self.client.retrieve( collection_name=index, ids=ids, @@ -305,7 +302,9 @@ def get_documents_by_id( ) for record in records: - documents.append(self.qdrant_to_haystack.point_to_document(record)) + documents.append( + convert_qdrant_point_to_haystack_document(record, use_sparse_embeddings=self.use_sparse_embeddings) + ) return documents def query_by_sparse( @@ -313,8 +312,8 @@ def query_by_sparse( query_sparse_embedding: SparseEmbedding, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 + scale_score: bool = True, + return_embedding: bool = False, ) -> List[Document]: if not self.use_sparse_embeddings: message = ( @@ -323,7 +322,7 @@ def query_by_sparse( ) raise QdrantStoreError(message) - qdrant_filters = self.qdrant_filter_converter.convert(filters) + qdrant_filters = convert_filters_to_qdrant(filters) query_indices = query_sparse_embedding.indices query_values = query_sparse_embedding.values points = self.client.search( @@ -339,7 +338,10 @@ def query_by_sparse( limit=top_k, with_vectors=return_embedding, ) - results = [self.qdrant_to_haystack.point_to_document(point) for point in points] + results = [ + convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings) + for point in points + ] if scale_score: for document in results: score = document.score @@ -352,10 +354,10 @@ def query_by_embedding( query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, # noqa: FBT001, FBT002 - return_embedding: bool = False, # noqa: FBT001, FBT002 + scale_score: bool = True, + return_embedding: bool = False, ) -> List[Document]: - qdrant_filters = self.qdrant_filter_converter.convert(filters) + qdrant_filters = convert_filters_to_qdrant(filters) points = self.client.search( collection_name=self.index, @@ -367,7 +369,10 @@ def query_by_embedding( limit=top_k, with_vectors=return_embedding, ) - results = [self.qdrant_to_haystack.point_to_document(point) for point in points] + results = [ + convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=self.use_sparse_embeddings) + for point in points + ] if scale_score: for document in results: score = document.score @@ -406,10 +411,10 @@ def _set_up_collection( self, collection_name: str, embedding_dim: int, - recreate_collection: bool, # noqa: FBT001 + recreate_collection: bool, similarity: str, - use_sparse_embeddings: bool, # noqa: FBT001 - on_disk: bool = False, # noqa: FBT001, FBT002 + use_sparse_embeddings: bool, + on_disk: bool = False, payload_fields_to_index: Optional[List[dict]] = None, ): distance = self._get_distance(similarity) @@ -490,13 +495,15 @@ def _recreate_collection( collection_name: str, distance, embedding_dim: int, - on_disk: bool, # noqa: FBT001 - use_sparse_embeddings: bool, # noqa: FBT001 + on_disk: bool, + use_sparse_embeddings: bool, ): - dense_vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance) + # dense vectors configuration + vectors_config = rest.VectorParams(size=embedding_dim, on_disk=on_disk, distance=distance) if use_sparse_embeddings: - vectors_config = {DENSE_VECTORS_NAME: dense_vectors_config} + # in this case, we need to define named vectors + vectors_config = {DENSE_VECTORS_NAME: vectors_config} sparse_vectors_config = { SPARSE_VECTORS_NAME: rest.SparseVectorParams( @@ -505,8 +512,6 @@ def _recreate_collection( ) ), } - else: - vectors_config = dense_vectors_config self.client.recreate_collection( collection_name=collection_name, diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py index 72a74a8b1..c4387b1e5 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py @@ -4,226 +4,230 @@ from haystack.utils.filters import COMPARISON_OPERATORS, LOGICAL_OPERATORS, FilterError from qdrant_client.http import models -from .converters import HaystackToQdrant +from .converters import convert_id COMPARISON_OPERATORS = COMPARISON_OPERATORS.keys() LOGICAL_OPERATORS = LOGICAL_OPERATORS.keys() -class QdrantFilterConverter: +def convert_filters_to_qdrant( + filter_term: Optional[Union[List[dict], dict]] = None, +) -> Optional[models.Filter]: """Converts Haystack filters to the format used by Qdrant.""" - def __init__(self): - self.haystack_to_qdrant_converter = HaystackToQdrant() + if not filter_term: + return None - def convert( - self, - filter_term: Optional[Union[List[dict], dict]] = None, - ) -> Optional[models.Filter]: - if not filter_term: - return None + must_clauses, should_clauses, must_not_clauses = [], [], [] - must_clauses, should_clauses, must_not_clauses = [], [], [] + if isinstance(filter_term, dict): + filter_term = [filter_term] - if isinstance(filter_term, dict): - filter_term = [filter_term] + for item in filter_term: + operator = item.get("operator") + if operator is None: + msg = "Operator not found in filters" + raise FilterError(msg) - for item in filter_term: - operator = item.get("operator") - if operator is None: - msg = "Operator not found in filters" - raise FilterError(msg) + if operator in LOGICAL_OPERATORS and "conditions" not in item: + msg = f"'conditions' not found for '{operator}'" + raise FilterError(msg) - if operator in LOGICAL_OPERATORS and "conditions" not in item: - msg = f"'conditions' not found for '{operator}'" + if operator == "AND": + must_clauses.append(convert_filters_to_qdrant(item.get("conditions", []))) + elif operator == "OR": + should_clauses.append(convert_filters_to_qdrant(item.get("conditions", []))) + elif operator == "NOT": + must_not_clauses.append(convert_filters_to_qdrant(item.get("conditions", []))) + elif operator in COMPARISON_OPERATORS: + field = item.get("field") + value = item.get("value") + if field is None or value is None: + msg = f"'field' or 'value' not found for '{operator}'" raise FilterError(msg) - if operator == "AND": - must_clauses.append(self.convert(item.get("conditions", []))) - elif operator == "OR": - should_clauses.append(self.convert(item.get("conditions", []))) - elif operator == "NOT": - must_not_clauses.append(self.convert(item.get("conditions", []))) - elif operator in COMPARISON_OPERATORS: - field = item.get("field") - value = item.get("value") - if field is None or value is None: - msg = f"'field' or 'value' not found for '{operator}'" - raise FilterError(msg) - - must_clauses.extend( - self._parse_comparison_operation(comparison_operation=operator, key=field, value=value) - ) - else: - msg = f"Unknown operator {operator} used in filters" - raise FilterError(msg) + must_clauses.extend(_parse_comparison_operation(comparison_operation=operator, key=field, value=value)) + else: + msg = f"Unknown operator {operator} used in filters" + raise FilterError(msg) - payload_filter = models.Filter( - must=must_clauses or None, - should=should_clauses or None, - must_not=must_not_clauses or None, - ) + payload_filter = models.Filter( + must=must_clauses or None, + should=should_clauses or None, + must_not=must_not_clauses or None, + ) - filter_result = self._squeeze_filter(payload_filter) + filter_result = _squeeze_filter(payload_filter) - return filter_result + return filter_result - def _parse_comparison_operation( - self, comparison_operation: str, key: str, value: Union[dict, List, str, float] - ) -> List[models.Condition]: - conditions: List[models.Condition] = [] - condition_builder_mapping = { - "==": self._build_eq_condition, - "in": self._build_in_condition, - "!=": self._build_ne_condition, - "not in": self._build_nin_condition, - ">": self._build_gt_condition, - ">=": self._build_gte_condition, - "<": self._build_lt_condition, - "<=": self._build_lte_condition, - } +def _parse_comparison_operation( + comparison_operation: str, key: str, value: Union[dict, List, str, float] +) -> List[models.Condition]: + conditions: List[models.Condition] = [] - condition_builder = condition_builder_mapping.get(comparison_operation) + condition_builder_mapping = { + "==": _build_eq_condition, + "in": _build_in_condition, + "!=": _build_ne_condition, + "not in": _build_nin_condition, + ">": _build_gt_condition, + ">=": _build_gte_condition, + "<": _build_lt_condition, + "<=": _build_lte_condition, + } - if condition_builder is None: - msg = f"Unknown operator {comparison_operation} used in filters" - raise ValueError(msg) + condition_builder = condition_builder_mapping.get(comparison_operation) - conditions.append(condition_builder(key, value)) + if condition_builder is None: + msg = f"Unknown operator {comparison_operation} used in filters" + raise ValueError(msg) - return conditions + conditions.append(condition_builder(key, value)) - def _build_eq_condition(self, key: str, value: models.ValueVariants) -> models.Condition: - if isinstance(value, str) and " " in value: - models.FieldCondition(key=key, match=models.MatchText(text=value)) - return models.FieldCondition(key=key, match=models.MatchValue(value=value)) + return conditions - def _build_in_condition(self, key: str, value: List[models.ValueVariants]) -> models.Condition: - if not isinstance(value, list): - msg = f"Value {value} is not a list" - raise FilterError(msg) - return models.Filter( - should=[ - ( - models.FieldCondition(key=key, match=models.MatchText(text=item)) - if isinstance(item, str) and " " not in item - else models.FieldCondition(key=key, match=models.MatchValue(value=item)) - ) - for item in value - ] - ) - - def _build_ne_condition(self, key: str, value: models.ValueVariants) -> models.Condition: - return models.Filter( - must_not=[ - ( - models.FieldCondition(key=key, match=models.MatchText(text=value)) - if isinstance(value, str) and " " not in value - else models.FieldCondition(key=key, match=models.MatchValue(value=value)) - ) - ] - ) - - def _build_nin_condition(self, key: str, value: List[models.ValueVariants]) -> models.Condition: - if not isinstance(value, list): - msg = f"Value {value} is not a list" - raise FilterError(msg) - return models.Filter( - must_not=[ - ( - models.FieldCondition(key=key, match=models.MatchText(text=item)) - if isinstance(item, str) and " " in item - else models.FieldCondition(key=key, match=models.MatchValue(value=item)) - ) - for item in value - ] - ) - - def _build_lt_condition(self, key: str, value: Union[str, float, int]) -> models.Condition: - if isinstance(value, str) and is_datetime_string(value): - return models.FieldCondition(key=key, range=models.DatetimeRange(lt=value)) - - if isinstance(value, (int, float)): - return models.FieldCondition(key=key, range=models.Range(lt=value)) - - msg = f"Value {value} is not an int or float or datetime string" - raise FilterError(msg) - def _build_lte_condition(self, key: str, value: Union[str, float, int]) -> models.Condition: - if isinstance(value, str) and is_datetime_string(value): - return models.FieldCondition(key=key, range=models.DatetimeRange(lte=value)) +def _build_eq_condition(key: str, value: models.ValueVariants) -> models.Condition: + if isinstance(value, str) and " " in value: + models.FieldCondition(key=key, match=models.MatchText(text=value)) + return models.FieldCondition(key=key, match=models.MatchValue(value=value)) - if isinstance(value, (int, float)): - return models.FieldCondition(key=key, range=models.Range(lte=value)) - msg = f"Value {value} is not an int or float or datetime string" +def _build_in_condition(key: str, value: List[models.ValueVariants]) -> models.Condition: + if not isinstance(value, list): + msg = f"Value {value} is not a list" + raise FilterError(msg) + return models.Filter( + should=[ + ( + models.FieldCondition(key=key, match=models.MatchText(text=item)) + if isinstance(item, str) and " " not in item + else models.FieldCondition(key=key, match=models.MatchValue(value=item)) + ) + for item in value + ] + ) + + +def _build_ne_condition(key: str, value: models.ValueVariants) -> models.Condition: + return models.Filter( + must_not=[ + ( + models.FieldCondition(key=key, match=models.MatchText(text=value)) + if isinstance(value, str) and " " not in value + else models.FieldCondition(key=key, match=models.MatchValue(value=value)) + ) + ] + ) + + +def _build_nin_condition(key: str, value: List[models.ValueVariants]) -> models.Condition: + if not isinstance(value, list): + msg = f"Value {value} is not a list" raise FilterError(msg) + return models.Filter( + must_not=[ + ( + models.FieldCondition(key=key, match=models.MatchText(text=item)) + if isinstance(item, str) and " " in item + else models.FieldCondition(key=key, match=models.MatchValue(value=item)) + ) + for item in value + ] + ) - def _build_gt_condition(self, key: str, value: Union[str, float, int]) -> models.Condition: - if isinstance(value, str) and is_datetime_string(value): - return models.FieldCondition(key=key, range=models.DatetimeRange(gt=value)) - if isinstance(value, (int, float)): - return models.FieldCondition(key=key, range=models.Range(gt=value)) +def _build_lt_condition(key: str, value: Union[str, float, int]) -> models.Condition: + if isinstance(value, str) and is_datetime_string(value): + return models.FieldCondition(key=key, range=models.DatetimeRange(lt=value)) - msg = f"Value {value} is not an int or float or datetime string" - raise FilterError(msg) + if isinstance(value, (int, float)): + return models.FieldCondition(key=key, range=models.Range(lt=value)) - def _build_gte_condition(self, key: str, value: Union[str, float, int]) -> models.Condition: - if isinstance(value, str) and is_datetime_string(value): - return models.FieldCondition(key=key, range=models.DatetimeRange(gte=value)) + msg = f"Value {value} is not an int or float or datetime string" + raise FilterError(msg) - if isinstance(value, (int, float)): - return models.FieldCondition(key=key, range=models.Range(gte=value)) - msg = f"Value {value} is not an int or float or datetime string" - raise FilterError(msg) +def _build_lte_condition(key: str, value: Union[str, float, int]) -> models.Condition: + if isinstance(value, str) and is_datetime_string(value): + return models.FieldCondition(key=key, range=models.DatetimeRange(lte=value)) + + if isinstance(value, (int, float)): + return models.FieldCondition(key=key, range=models.Range(lte=value)) + + msg = f"Value {value} is not an int or float or datetime string" + raise FilterError(msg) + - def _build_has_id_condition(self, id_values: List[models.ExtendedPointId]) -> models.HasIdCondition: - return models.HasIdCondition( - has_id=[ - # Ids are converted into their internal representation - self.haystack_to_qdrant_converter.convert_id(item) - for item in id_values - ] - ) - - def _squeeze_filter(self, payload_filter: models.Filter) -> models.Filter: - """ - Simplify given payload filter, if the nested structure might be unnested. - That happens if there is a single clause in that filter. - :param payload_filter: - :returns: - """ - filter_parts = { - "must": payload_filter.must, - "should": payload_filter.should, - "must_not": payload_filter.must_not, - } - - total_clauses = sum(len(x) for x in filter_parts.values() if x is not None) - if total_clauses == 0 or total_clauses > 1: - return payload_filter - - # Payload filter has just a single clause provided (either must, should - # or must_not). If that single clause is also of a models.Filter type, - # then it might be returned instead. - for part_name, filter_part in filter_parts.items(): - if not filter_part: - continue - - subfilter = filter_part[0] - if not isinstance(subfilter, models.Filter): - # The inner statement is a simple condition like models.FieldCondition - # so it cannot be simplified. - continue - - if subfilter.must: - return models.Filter(**{part_name: subfilter.must}) +def _build_gt_condition(key: str, value: Union[str, float, int]) -> models.Condition: + if isinstance(value, str) and is_datetime_string(value): + return models.FieldCondition(key=key, range=models.DatetimeRange(gt=value)) + if isinstance(value, (int, float)): + return models.FieldCondition(key=key, range=models.Range(gt=value)) + + msg = f"Value {value} is not an int or float or datetime string" + raise FilterError(msg) + + +def _build_gte_condition(key: str, value: Union[str, float, int]) -> models.Condition: + if isinstance(value, str) and is_datetime_string(value): + return models.FieldCondition(key=key, range=models.DatetimeRange(gte=value)) + + if isinstance(value, (int, float)): + return models.FieldCondition(key=key, range=models.Range(gte=value)) + + msg = f"Value {value} is not an int or float or datetime string" + raise FilterError(msg) + + +def _build_has_id_condition(id_values: List[models.ExtendedPointId]) -> models.HasIdCondition: + return models.HasIdCondition( + has_id=[ + # Ids are converted into their internal representation + convert_id(item) + for item in id_values + ] + ) + + +def _squeeze_filter(payload_filter: models.Filter) -> models.Filter: + """ + Simplify given payload filter, if the nested structure might be unnested. + That happens if there is a single clause in that filter. + :param payload_filter: + :returns: + """ + filter_parts = { + "must": payload_filter.must, + "should": payload_filter.should, + "must_not": payload_filter.must_not, + } + + total_clauses = sum(len(x) for x in filter_parts.values() if x is not None) + if total_clauses == 0 or total_clauses > 1: return payload_filter + # Payload filter has just a single clause provided (either must, should + # or must_not). If that single clause is also of a models.Filter type, + # then it might be returned instead. + for part_name, filter_part in filter_parts.items(): + if not filter_part: + continue + + subfilter = filter_part[0] + if not isinstance(subfilter, models.Filter): + # The inner statement is a simple condition like models.FieldCondition + # so it cannot be simplified. + continue + + if subfilter.must: + return models.Filter(**{part_name: subfilter.must}) + + return payload_filter + def is_datetime_string(value: str) -> bool: try: diff --git a/integrations/qdrant/tests/test_converters.py b/integrations/qdrant/tests/test_converters.py index fd9d5f3ad..242c4cafe 100644 --- a/integrations/qdrant/tests/test_converters.py +++ b/integrations/qdrant/tests/test_converters.py @@ -1,40 +1,19 @@ import numpy as np -import pytest -from haystack_integrations.document_stores.qdrant.converters import HaystackToQdrant, QdrantToHaystack +from haystack_integrations.document_stores.qdrant.converters import ( + convert_id, + convert_qdrant_point_to_haystack_document, +) from qdrant_client.http import models as rest -CONTENT_FIELD = "content" -NAME_FIELD = "name" -EMBEDDING_FIELD = "vector" -SPARSE_EMBEDDING_FIELD = "sparse-vector" - -@pytest.fixture -def haystack_to_qdrant() -> HaystackToQdrant: - return HaystackToQdrant() - - -@pytest.fixture -def qdrant_to_haystack(request) -> QdrantToHaystack: - return QdrantToHaystack( - content_field=CONTENT_FIELD, - name_field=NAME_FIELD, - embedding_field=EMBEDDING_FIELD, - use_sparse_embeddings=request.param, - sparse_embedding_field=SPARSE_EMBEDDING_FIELD, - ) - - -def test_convert_id_is_deterministic(haystack_to_qdrant: HaystackToQdrant): - first_id = haystack_to_qdrant.convert_id("test-id") - second_id = haystack_to_qdrant.convert_id("test-id") +def test_convert_id_is_deterministic(): + first_id = convert_id("test-id") + second_id = convert_id("test-id") assert first_id == second_id -@pytest.mark.parametrize("qdrant_to_haystack", [True], indirect=True) -def test_point_to_document_reverts_proper_structure_from_record_with_sparse( - qdrant_to_haystack: QdrantToHaystack, -): +def test_point_to_document_reverts_proper_structure_from_record_with_sparse(): + point = rest.Record( id="c7c62e8e-02b9-4ec6-9f88-46bd97b628b7", payload={ @@ -51,7 +30,7 @@ def test_point_to_document_reverts_proper_structure_from_record_with_sparse( "text-sparse": {"indices": [7, 1024, 367], "values": [0.1, 0.98, 0.33]}, }, ) - document = qdrant_to_haystack.point_to_document(point) + document = convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=True) assert "my-id" == document.id assert "Lorem ipsum" == document.content assert "text" == document.content_type @@ -60,10 +39,8 @@ def test_point_to_document_reverts_proper_structure_from_record_with_sparse( assert 0.0 == np.sum(np.array([1.0, 0.0, 0.0, 0.0]) - document.embedding) -@pytest.mark.parametrize("qdrant_to_haystack", [False], indirect=True) -def test_point_to_document_reverts_proper_structure_from_record_without_sparse( - qdrant_to_haystack: QdrantToHaystack, -): +def test_point_to_document_reverts_proper_structure_from_record_without_sparse(): + point = rest.Record( id="c7c62e8e-02b9-4ec6-9f88-46bd97b628b7", payload={ @@ -77,7 +54,7 @@ def test_point_to_document_reverts_proper_structure_from_record_without_sparse( }, vector=[1.0, 0.0, 0.0, 0.0], ) - document = qdrant_to_haystack.point_to_document(point) + document = convert_qdrant_point_to_haystack_document(point, use_sparse_embeddings=False) assert "my-id" == document.id assert "Lorem ipsum" == document.content assert "text" == document.content_type diff --git a/integrations/qdrant/tests/test_dict_converters.py b/integrations/qdrant/tests/test_dict_converters.py index 7e94fa083..6c8e46710 100644 --- a/integrations/qdrant/tests/test_dict_converters.py +++ b/integrations/qdrant/tests/test_dict_converters.py @@ -26,7 +26,6 @@ def test_to_dict(): "name_field": "name", "embedding_field": "embedding", "use_sparse_embeddings": False, - "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -66,7 +65,6 @@ def test_from_dict(): "name_field": "name", "embedding_field": "embedding", "use_sparse_embeddings": True, - "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -91,7 +89,6 @@ def test_from_dict(): document_store.name_field == "name", document_store.embedding_field == "embedding", document_store.use_sparse_embeddings is True, - document_store.sparse_embedding_field == "sparse_embedding", document_store.on_disk is False, document_store.similarity == "cosine", document_store.return_embedding is False, diff --git a/integrations/qdrant/tests/test_retriever.py b/integrations/qdrant/tests/test_retriever.py index 0f9452143..96e748220 100644 --- a/integrations/qdrant/tests/test_retriever.py +++ b/integrations/qdrant/tests/test_retriever.py @@ -50,7 +50,6 @@ def test_to_dict(self): "name_field": "name", "embedding_field": "embedding", "use_sparse_embeddings": False, - "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True, @@ -173,7 +172,6 @@ def test_to_dict(self): "name_field": "name", "embedding_field": "embedding", "use_sparse_embeddings": False, - "sparse_embedding_field": "sparse_embedding", "similarity": "cosine", "return_embedding": False, "progress_bar": True,