Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Qdrant): start to work on sparse vector integration #578

Merged
merged 45 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
0f650ee
feat(Qdrant): start to working on sparse vector integration
lambda-science Mar 13, 2024
6025e50
Progress towards Sparse vector support with Fastembed
Mar 13, 2024
ad6fcbc
__init__.py
Mar 13, 2024
c9a571a
merge batch results for hybrid request
Mar 13, 2024
3824e8c
feat(Qdrant): missing comma
lambda-science Mar 14, 2024
4253a1c
feat(Qdrant): making some test progress
lambda-science Mar 14, 2024
37fddeb
feat(Qdrant): all current test are fixed
lambda-science Mar 14, 2024
550ef45
feat(Qdrant): linting
lambda-science Mar 14, 2024
c79c604
feat(Qdrant): working sparse retriver hooray
lambda-science Mar 14, 2024
91d67f7
feat(Qdrant): fix hybrid retriver
lambda-science Mar 20, 2024
115c01b
feat(Qdrant): modify PR for haystack 2.1.0 with proper sparse vectors
lambda-science Mar 20, 2024
9c4e256
feat(Qdrant): fix lint
lambda-science Mar 20, 2024
bafc27a
Merge branch 'main' into qdrant-sparse
anakin87 Mar 21, 2024
3339598
test w Haystack main
anakin87 Mar 21, 2024
725c9dc
fix deps
anakin87 Mar 21, 2024
cea3cb7
Update integrations/qdrant/src/haystack_integrations/components/retri…
Mar 21, 2024
34cd6cf
Merge branch 'deepset-ai:main' into qdrant-sparse
Mar 22, 2024
9441135
feat(Qdrant): remove hybrid & old code, constant for vector field names
lambda-science Mar 22, 2024
6cacc78
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 22, 2024
d911b18
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 22, 2024
125c123
feat(Qdrant): reverting pop change, changing Dict to SparseEmbedding …
lambda-science Mar 22, 2024
7cf1882
feat(Qdrant): fix lint
lambda-science Mar 22, 2024
9749ee0
feat(Qdrant): remove old todo
lambda-science Mar 22, 2024
2683a74
simplify documents_to_batch
anakin87 Mar 22, 2024
79d0d52
feat(Qdrant): SparseEmbedding instead of Dict
lambda-science Mar 22, 2024
529719a
feat(Qdrant): introducing `use_sparse_embeddings` parameters for docu…
lambda-science Mar 22, 2024
fd064da
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
b018504
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
e1c38be
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
1cb601d
feat(Qdrant): bugfix
lambda-science Mar 22, 2024
a3bd3d3
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
a72b65b
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
827b826
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
e66c74b
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
f7cf65e
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
a968d71
Revert "Update integrations/qdrant/src/haystack_integrations/document…
lambda-science Mar 25, 2024
56c4ee5
feat(Qdrant): fixing test
lambda-science Mar 25, 2024
9974975
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
67ad96c
feat(Qdrant): fixing creation
lambda-science Mar 25, 2024
2f103a7
feat(Qdrant): fixing creation
lambda-science Mar 25, 2024
10b79e8
Merge branch 'main' into qdrant-sparse
anakin87 Mar 27, 2024
976cbb5
little fixes
anakin87 Mar 27, 2024
37a2fdc
Merge branch 'main' into qdrant-sparse
anakin87 Apr 11, 2024
ee819c8
make changes nonbreaking
anakin87 Apr 11, 2024
804afd7
refactoring
anakin87 Apr 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from .retriever import QdrantEmbeddingRetriever
from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever

__all__ = ("QdrantEmbeddingRetriever",)
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever")
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, default_from_dict, default_to_dict
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
Expand All @@ -7,7 +7,7 @@
@component
class QdrantEmbeddingRetriever:
"""
A component for retrieving documents from an QdrantDocumentStore.
A component for retrieving documents from an QdrantDocumentStore using dense vectors.

Usage example:
```python
Expand Down Expand Up @@ -120,3 +120,248 @@ def run(
)

return {"documents": docs}


@component
class QdrantSparseRetriever:
    """
    A component for retrieving documents from a QdrantDocumentStore using sparse vectors.

    Usage example:
    ```python
    from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

    document_store = QdrantDocumentStore(
        ":memory:",
        recreate_index=True,
        return_sparse_embedding=True,
        wait_result_from_api=True,
    )
    retriever = QdrantSparseRetriever(document_store=document_store)

    # using a fake sparse vector to keep the example simple
    retriever.run(query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]})
    ```
    """

    def __init__(
        self,
        document_store: QdrantDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        scale_score: bool = True,  # noqa: FBT001, FBT002
        return_embedding: bool = False,  # noqa: FBT001, FBT002
    ):
        """
        Create a QdrantSparseRetriever component.

        :param document_store: An instance of QdrantDocumentStore.
        :param filters: A dictionary with filters to narrow down the search space. Default is None.
        :param top_k: The maximum number of documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
        :param return_embedding: Whether to return the sparse embedding of the retrieved Documents.
            Default is False.

        :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
        """

        if not isinstance(document_store, QdrantDocumentStore):
            msg = "document_store must be an instance of QdrantDocumentStore"
            raise ValueError(msg)

        self._document_store = document_store
        self._filters = filters
        self._top_k = top_k
        self._scale_score = scale_score
        self._return_embedding = return_embedding

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        d = default_to_dict(
            self,
            document_store=self._document_store,
            filters=self._filters,
            top_k=self._top_k,
            scale_score=self._scale_score,
            return_embedding=self._return_embedding,
        )
        # The document store is not serializable as-is; replace it with its own dict form.
        d["init_parameters"]["document_store"] = self._document_store.to_dict()

        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseRetriever":
        """
        Deserializes the component from a dictionary.

        The input dictionary is left unmodified, so the same serialized form can be
        deserialized more than once.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        # Work on shallow copies: writing the rebuilt store back into the caller's dict
        # would make a second from_dict() call on the same data fail.
        init_parameters = dict(data["init_parameters"])
        init_parameters["document_store"] = QdrantDocumentStore.from_dict(init_parameters["document_store"])
        return default_from_dict(cls, {**data, "init_parameters": init_parameters})

    @component.output_types(documents=List[Document])
    def run(
        self,
        query_sparse_embedding: Dict[str, Union[List[int], List[float]]],
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
        scale_score: Optional[bool] = None,
        return_embedding: Optional[bool] = None,
    ) -> Dict[str, List[Document]]:
        """
        Run the Sparse Embedding Retriever on the given input data.

        Arguments left as None fall back to the values given at construction time.

        :param query_sparse_embedding: Sparse Embedding of the query.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The maximum number of documents to return.
        :param scale_score: Whether to scale the scores of the retrieved documents or not.
        :param return_embedding: Whether to return the embedding of the retrieved Documents.
        :returns:
            A dictionary whose "documents" key holds the retrieved documents.

        """
        # Boolean overrides must be compared against None: with `or`, an explicit
        # False would be discarded whenever the init-time default is True.
        effective_scale_score = self._scale_score if scale_score is None else scale_score
        effective_return_embedding = self._return_embedding if return_embedding is None else return_embedding

        docs = self._document_store.query_by_sparse(
            query_sparse_embedding=query_sparse_embedding,
            filters=filters or self._filters,
            top_k=top_k or self._top_k,
            scale_score=effective_scale_score,
            return_embedding=effective_return_embedding,
        )

        return {"documents": docs}


# @component
# class QdrantHybridRetriever:
lambda-science marked this conversation as resolved.
Show resolved Hide resolved
# """
# A component for retrieving documents from a QdrantDocumentStore using hybrid search (dense+sparse).
#
# Usage example:
# ```python
# from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
# from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
#
# document_store = QdrantDocumentStore(
# ":memory:",
# recreate_index=True,
# return_sparse_embedding=True,
# wait_result_from_api=True,
# )
# retriever = QdrantHybridRetriever(document_store=document_store)
#
# # using a fake sparse vector to keep the example simple
# retriever.run(query_embedding=[0.1]*768,
# query_sparse_embedding={"indices":[0, 1, 2, 3], "values":[0.1, 0.8, 0.05, 0.33]})
# ```
# """
#
# def __init__(
# self,
# document_store: QdrantDocumentStore,
# filters: Optional[Dict[str, Any]] = None,
# top_k_dense: int = 10,
# top_k_sparse: int = 10,
# scale_score: bool = True,
# return_embedding: bool = False,
# ):
# """
# Create a QdrantHybridRetriever component.
#
# :param document_store: An instance of QdrantDocumentStore.
# :param filters: A dictionary with filters to narrow down the search space. Default is None.
# :param top_k_dense: The maximum number of documents to retrieve by dense retriever. Default is 10.
# :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever. Default is 10.
# :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
# :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.
#
# :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
# """
#
# if not isinstance(document_store, QdrantDocumentStore):
# msg = "document_store must be an instance of QdrantDocumentStore"
# raise ValueError(msg)
#
# self._document_store = document_store
# self._filters = filters
# self._top_k_dense = top_k_dense
# self._top_k_sparse = top_k_sparse
# self._scale_score = scale_score
# self._return_embedding = return_embedding
#
# def to_dict(self) -> Dict[str, Any]:
# """
# Serializes the component to a dictionary.
#
# :returns:
# Dictionary with serialized data.
# """
# d = default_to_dict(
# self,
# document_store=self._document_store,
# filters=self._filters,
# top_k_dense=self._top_k_dense,
# top_k_sparse=self._top_k_sparse,
# scale_score=self._scale_score,
# return_embedding=self._return_embedding,
# )
# d["init_parameters"]["document_store"] = self._document_store.to_dict()
#
# return d
#
# @classmethod
# def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever":
# """
# Deserializes the component from a dictionary.
#
# :param data:
# Dictionary to deserialize from.
# :returns:
# Deserialized component.
# """
# document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
# data["init_parameters"]["document_store"] = document_store
# return default_from_dict(cls, data)
#
# @component.output_types(documents=List[Document])
# def run(
# self,
# query_sparse_embedding: Dict[str, Union[List[int], List[float]]],
# query_embedding: List[float],
# filters: Optional[Dict[str, Any]] = None,
# top_k_dense: Optional[int] = None,
# top_k_sparse: Optional[int] = None,
# scale_score: Optional[bool] = None,
# return_embedding: Optional[bool] = None,
# ):
# """
# Run the Hybrid Retriever (dense + sparse) on the given input data.
#
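# :param query_sparse_embedding: Sparse Embedding of the query.
# :param query_embedding: Dense embedding of the query.
# :param filters: A dictionary with filters to narrow down the search space.
# :param top_k_dense: The maximum number of documents to retrieve by dense retriever.
# :param top_k_sparse: The maximum number of documents to retrieve by sparse retriever.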
# :param scale_score: Whether to scale the scores of the retrieved documents or not.
# :param return_embedding: Whether to return the embedding of the retrieved Documents.
# :returns:
# The retrieved documents.
#
# """
# docs = self._document_store.query_hybrid(
# query_sparse_embedding=query_sparse_embedding,
# query_embedding=query_embedding,
# filters=filters or self._filters,
# top_k_dense=top_k_dense or self._top_k_dense,
# top_k_sparse=top_k_sparse or self._top_k_sparse,
# scale_score=scale_score or self._scale_score,
# return_embedding=return_embedding or self._return_embedding,
# )
#
# return {"documents": docs}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@ def documents_to_batch(
points = []
for document in documents:
payload = document.to_dict(flatten=False)
vector = payload.pop(embedding_field) or {}
vector = {}
if embedding_field in payload and payload[embedding_field] is not None:
dense_vector = payload.pop(embedding_field) or []
vector["text-dense"] = dense_vector
# TODO: Adapt to Haystack Modification of the Document Dataclass
if "_sparse_vector" in payload["meta"]:
sparse_vector = payload["meta"].pop("_sparse_vector", {"indices": [], "values": []})
sparse_vector_instance = rest.SparseVector(**sparse_vector)
vector["text-sparse"] = sparse_vector_instance
lambda-science marked this conversation as resolved.
Show resolved Hide resolved
lambda-science marked this conversation as resolved.
Show resolved Hide resolved
_id = self.convert_id(payload.get("id"))

point = rest.PointStruct(
Expand Down Expand Up @@ -51,6 +59,18 @@ def __init__(self, content_field: str, name_field: str, embedding_field: str):

def point_to_document(self, point: QdrantPoint) -> Document:
payload = {**point.payload}
payload["embedding"] = point.vector if hasattr(point, "vector") else None
if hasattr(point, "vector") and point.vector is not None and "text-dense" in point.vector:
payload["embedding"] = point.vector["text-dense"]
else:
payload["embedding"] = None
payload["score"] = point.score if hasattr(point, "score") else None
# TODO: Adapt to Haystack Modification of the Document Dataclass
if hasattr(point, "vector") and point.vector is not None and "text-sparse" in point.vector:
parse_vector_dict = {
"indices": point.vector["text-sparse"].indices,
"values": point.vector["text-sparse"].values,
}
payload["meta"]["_sparse_vector"] = parse_vector_dict
else:
payload["meta"]["_sparse_vector"] = None
return Document.from_dict(payload)
Loading