feat(Qdrant): start to work on sparse vector integration #578

Merged
45 commits merged on Apr 12, 2024

Changes from all commits
0f650ee
feat(Qdrant): start to working on sparse vector integration
lambda-science Mar 13, 2024
6025e50
Progress towards Sparse vector support with Fastembed
Mar 13, 2024
ad6fcbc
__init__.py
Mar 13, 2024
c9a571a
merge batch results for hybrid request
Mar 13, 2024
3824e8c
feat(Qdrant): missing comma
lambda-science Mar 14, 2024
4253a1c
feat(Qdrant): making some test progress
lambda-science Mar 14, 2024
37fddeb
feat(Qdrant): all current test are fixed
lambda-science Mar 14, 2024
550ef45
feat(Qdrant): linting
lambda-science Mar 14, 2024
c79c604
feat(Qdrant): working sparse retriver hooray
lambda-science Mar 14, 2024
91d67f7
feat(Qdrant): fix hybrid retriver
lambda-science Mar 20, 2024
115c01b
feat(Qdrant): modify PR for haystack 2.1.0 with proper sparse vectors
lambda-science Mar 20, 2024
9c4e256
feat(Qdrant): fix lint
lambda-science Mar 20, 2024
bafc27a
Merge branch 'main' into qdrant-sparse
anakin87 Mar 21, 2024
3339598
test w Haystack main
anakin87 Mar 21, 2024
725c9dc
fix deps
anakin87 Mar 21, 2024
cea3cb7
Update integrations/qdrant/src/haystack_integrations/components/retri…
Mar 21, 2024
34cd6cf
Merge branch 'deepset-ai:main' into qdrant-sparse
Mar 22, 2024
9441135
feat(Qdrant): remove hybrid & old code, constant for vector field names
lambda-science Mar 22, 2024
6cacc78
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 22, 2024
d911b18
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 22, 2024
125c123
feat(Qdrant): reverting pop change, changing Dict to SparseEmbedding …
lambda-science Mar 22, 2024
7cf1882
feat(Qdrant): fix lint
lambda-science Mar 22, 2024
9749ee0
feat(Qdrant): remove old todo
lambda-science Mar 22, 2024
2683a74
simplify documents_to_batch
anakin87 Mar 22, 2024
79d0d52
feat(Qdrant): SparseEmbedding instead of Dict
lambda-science Mar 22, 2024
529719a
feat(Qdrant): introducing `use_sparse_embeddings` parameters for docu…
lambda-science Mar 22, 2024
fd064da
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
b018504
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
e1c38be
feat(Qdrant): `use_sparse_embeddings` true by default + bugfix
lambda-science Mar 22, 2024
1cb601d
feat(Qdrant): bugfix
lambda-science Mar 22, 2024
a3bd3d3
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
a72b65b
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
827b826
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
e66c74b
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
f7cf65e
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
a968d71
Revert "Update integrations/qdrant/src/haystack_integrations/document…
lambda-science Mar 25, 2024
56c4ee5
feat(Qdrant): fixing test
lambda-science Mar 25, 2024
9974975
Update integrations/qdrant/src/haystack_integrations/document_stores/…
Mar 25, 2024
67ad96c
feat(Qdrant): fixing creation
lambda-science Mar 25, 2024
2f103a7
feat(Qdrant): fixing creation
lambda-science Mar 25, 2024
10b79e8
Merge branch 'main' into qdrant-sparse
anakin87 Mar 27, 2024
976cbb5
little fixes
anakin87 Mar 27, 2024
37a2fdc
Merge branch 'main' into qdrant-sparse
anakin87 Apr 11, 2024
ee819c8
make changes nonbreaking
anakin87 Apr 11, 2024
804afd7
refactoring
anakin87 Apr 12, 2024
4 changes: 3 additions & 1 deletion integrations/qdrant/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai", "qdrant-client"]
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]

[project.urls]
Source = "https://github.com/deepset-ai/haystack-core-integrations"
@@ -103,6 +103,8 @@ ignore = [
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Allow boolean arguments in function definition
"FBT001", "FBT002",
# Ignore checks for possible passwords
"S105",
"S106",
@@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from .retriever import QdrantEmbeddingRetriever
from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever

__all__ = ("QdrantEmbeddingRetriever",)
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever")
@@ -1,13 +1,14 @@
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


@component
class QdrantEmbeddingRetriever:
"""
A component for retrieving documents from an QdrantDocumentStore.
A component for retrieving documents from an QdrantDocumentStore using dense vectors.

Usage example:
```python
@@ -32,8 +33,8 @@ def __init__(
document_store: QdrantDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
scale_score: bool = True, # noqa: FBT001, FBT002
return_embedding: bool = False, # noqa: FBT001, FBT002
scale_score: bool = True,
return_embedding: bool = False,
):
"""
Create a QdrantEmbeddingRetriever component.
@@ -120,3 +121,121 @@ def run(
)

return {"documents": docs}


@component
class QdrantSparseRetriever:
"""
A component for retrieving documents from an QdrantDocumentStore using sparse vectors.

Usage example:
```python
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.dataclasses.sparse_embedding import SparseEmbedding

document_store = QdrantDocumentStore(
":memory:",
recreate_index=True,
return_embedding=True,
wait_result_from_api=True,
)
retriever = QdrantSparseRetriever(document_store=document_store)
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
retriever.run(query_sparse_embedding=sparse_embedding)
```
"""

def __init__(
self,
document_store: QdrantDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
scale_score: bool = True,
return_embedding: bool = False,
):
"""
Create a QdrantSparseRetriever component.

:param document_store: An instance of QdrantDocumentStore.
:param filters: A dictionary with filters to narrow down the search space. Default is None.
:param top_k: The maximum number of documents to retrieve. Default is 10.
:param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.

:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
"""

if not isinstance(document_store, QdrantDocumentStore):
msg = "document_store must be an instance of QdrantDocumentStore"
raise ValueError(msg)

self._document_store = document_store
self._filters = filters
self._top_k = top_k
self._scale_score = scale_score
self._return_embedding = return_embedding

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.

:returns:
Dictionary with serialized data.
"""
d = default_to_dict(
self,
document_store=self._document_store,
filters=self._filters,
top_k=self._top_k,
scale_score=self._scale_score,
return_embedding=self._return_embedding,
)
d["init_parameters"]["document_store"] = self._document_store.to_dict()

return d

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "QdrantEmbeddingRetriever":
"""
Deserializes the component from a dictionary.

:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
data["init_parameters"]["document_store"] = document_store
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
def run(
self,
query_sparse_embedding: SparseEmbedding,
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None,
scale_score: Optional[bool] = None,
return_embedding: Optional[bool] = None,
):
"""
Run the Sparse Embedding Retriever on the given input data.

:param query_sparse_embedding: Sparse Embedding of the query.
:param filters: A dictionary with filters to narrow down the search space.
:param top_k: The maximum number of documents to return.
:param scale_score: Whether to scale the scores of the retrieved documents or not.
:param return_embedding: Whether to return the embedding of the retrieved Documents.
:returns:
The retrieved documents.

"""
docs = self._document_store.query_by_sparse(
query_sparse_embedding=query_sparse_embedding,
filters=filters or self._filters,
top_k=top_k or self._top_k,
scale_score=scale_score or self._scale_score,
return_embedding=return_embedding or self._return_embedding,
)

return {"documents": docs}
@@ -7,64 +7,74 @@

logger = logging.getLogger(__name__)

DENSE_VECTORS_NAME = "text-dense"
SPARSE_VECTORS_NAME = "text-sparse"

class HaystackToQdrant:
"""A converter from Haystack to Qdrant types."""

UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")

def documents_to_batch(
self,
documents: List[Document],
*,
embedding_field: str,
) -> List[rest.PointStruct]:
points = []
for document in documents:
payload = document.to_dict(flatten=False)

def convert_haystack_documents_to_qdrant_points(
documents: List[Document],
*,
embedding_field: str,
use_sparse_embeddings: bool,
) -> List[rest.PointStruct]:
points = []
for document in documents:
payload = document.to_dict(flatten=False)
if use_sparse_embeddings:
vector = {}

dense_vector = payload.pop(embedding_field, None)
if dense_vector is not None:
vector[DENSE_VECTORS_NAME] = dense_vector

sparse_vector = payload.pop("sparse_embedding", None)
if sparse_vector is not None:
sparse_vector_instance = rest.SparseVector(**sparse_vector)
vector[SPARSE_VECTORS_NAME] = sparse_vector_instance

else:
vector = payload.pop(embedding_field) or {}
_id = self.convert_id(payload.get("id"))

# TODO: remove as soon as we introduce the support for sparse embeddings in Qdrant
if "sparse_embedding" in payload:
sparse_embedding = payload.pop("sparse_embedding", None)
if sparse_embedding:
logger.warning(
"Document %s has the `sparse_embedding` field set,"
"but storing sparse embeddings in Qdrant is not currently supported."
"The `sparse_embedding` field will be ignored.",
payload["id"],
)

point = rest.PointStruct(
payload=payload,
vector=vector,
id=_id,
)
points.append(point)
return points

def convert_id(self, _id: str) -> str:
"""
Converts any string into a UUID-like format in a deterministic way.

Qdrant does not accept any string as an id, so an internal id has to be
generated for each point. This is a deterministic way of doing so.
"""
return uuid.uuid5(self.UUID_NAMESPACE, _id).hex
_id = convert_id(payload.get("id"))

point = rest.PointStruct(
payload=payload,
vector=vector,
id=_id,
)
points.append(point)
return points


def convert_id(_id: str) -> str:
"""
Converts any string into a UUID-like format in a deterministic way.

Qdrant does not accept any string as an id, so an internal id has to be
generated for each point. This is a deterministic way of doing so.
"""
return uuid.uuid5(UUID_NAMESPACE, _id).hex


QdrantPoint = Union[rest.ScoredPoint, rest.Record]


class QdrantToHaystack:
def __init__(self, content_field: str, name_field: str, embedding_field: str):
self.content_field = content_field
self.name_field = name_field
self.embedding_field = embedding_field
def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document:
payload = {**point.payload}
payload["score"] = point.score if hasattr(point, "score") else None

def point_to_document(self, point: QdrantPoint) -> Document:
payload = {**point.payload}
if not use_sparse_embeddings:
payload["embedding"] = point.vector if hasattr(point, "vector") else None
payload["score"] = point.score if hasattr(point, "score") else None
return Document.from_dict(payload)
elif hasattr(point, "vector") and point.vector is not None:
payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME)

if SPARSE_VECTORS_NAME in point.vector:
parse_vector_dict = {
"indices": point.vector[SPARSE_VECTORS_NAME].indices,
"values": point.vector[SPARSE_VECTORS_NAME].values,
}
payload["sparse_embedding"] = parse_vector_dict

return Document.from_dict(payload)
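
For illustration, a rough round-trip sketch of the converter functions introduced in this diff; the import path for these internal helpers is an assumption, and the embedding values are made up.

```python
from haystack import Document
from haystack.dataclasses.sparse_embedding import SparseEmbedding

# Assumed module path for the internal converter helpers shown in this diff.
from haystack_integrations.document_stores.qdrant.converters import (
    convert_haystack_documents_to_qdrant_points,
    convert_qdrant_point_to_haystack_document,
)

doc = Document(
    content="hello sparse world",
    embedding=[0.1, 0.2, 0.3],
    sparse_embedding=SparseEmbedding(indices=[2, 5], values=[0.7, 0.4]),
)

# With use_sparse_embeddings=True the resulting point carries named vectors:
# {"text-dense": [...], "text-sparse": rest.SparseVector(indices=..., values=...)}
points = convert_haystack_documents_to_qdrant_points(
    [doc], embedding_field="embedding", use_sparse_embeddings=True
)
print(list(points[0].vector.keys()))  # ['text-dense', 'text-sparse']

# Converting a point back restores both embeddings on the Document
# (in practice the point would be a ScoredPoint returned by the Qdrant client).
restored = convert_qdrant_point_to_haystack_document(points[0], use_sparse_embeddings=True)
print(restored.sparse_embedding)
```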