feat(Qdrant): start to work on sparse vector integration (#578)
* feat(Qdrant): start working on sparse vector integration

* Progress towards Sparse vector support with Fastembed

* __init__.py

* merge batch results for hybrid request

* feat(Qdrant): missing comma

* feat(Qdrant): making some test progress

* feat(Qdrant): all current tests are fixed

* feat(Qdrant): linting

* feat(Qdrant): working sparse retriever, hooray

* feat(Qdrant): fix hybrid retriever

* feat(Qdrant): modify PR for haystack 2.1.0 with proper sparse vectors

* feat(Qdrant): fix lint

* test with Haystack main

* fix deps

* Update integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py

Co-authored-by: Anush  <[email protected]>

* feat(Qdrant): remove hybrid & old code, constant for vector field names

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* feat(Qdrant): reverting pop change, changing Dict to SparseEmbedding type

* feat(Qdrant): fix lint

* feat(Qdrant): remove old todo

* simplify documents_to_batch

* feat(Qdrant): SparseEmbedding instead of Dict

* feat(Qdrant): introducing a `use_sparse_embeddings` parameter for the document store to make sparse embeddings a non-breaking change (see the usage sketch after the changed-files summary below). Needs more testing

* feat(Qdrant): `use_sparse_embeddings` true by default + bugfix

* feat(Qdrant): `use_sparse_embeddings` true by default + bugfix

* feat(Qdrant): `use_sparse_embeddings` true by default + bugfix

* feat(Qdrant): bugfix

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

Co-authored-by: Anush  <[email protected]>

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

Co-authored-by: Anush  <[email protected]>

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py

Co-authored-by: Anush  <[email protected]>

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py

Co-authored-by: Anush  <[email protected]>

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py

Co-authored-by: Anush  <[email protected]>

* Revert "Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py"

This reverts commit f7cf65e.

* feat(Qdrant): fixing test

* Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

Co-authored-by: Anush  <[email protected]>

* feat(Qdrant): fixing creation

* feat(Qdrant): fixing creation

* little fixes

* make changes non-breaking

* refactoring

---------

Co-authored-by: anakin87 <[email protected]>
Co-authored-by: Anush <[email protected]>
3 people authored Apr 12, 2024
1 parent 363c7b5 commit 12cdc11
Showing 9 changed files with 704 additions and 313 deletions.
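
Taken together, the changes below add named sparse vectors to the Qdrant document store and a dedicated sparse retriever. As a rough usage sketch based on the `use_sparse_embeddings` flag and the `QdrantSparseRetriever` introduced in this diff (the embedding values are made up; a real pipeline would produce them with a sparse embedder):

```python
from haystack.dataclasses import Document
from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

# use_sparse_embeddings=True makes the store keep named dense ("text-dense")
# and sparse ("text-sparse") vectors for each point.
document_store = QdrantDocumentStore(
    ":memory:",
    recreate_index=True,
    use_sparse_embeddings=True,
    wait_result_from_api=True,
)

document_store.write_documents(
    [
        Document(
            content="Qdrant now stores sparse vectors too.",
            sparse_embedding=SparseEmbedding(indices=[0, 3, 7], values=[0.4, 0.9, 0.1]),
        )
    ]
)

retriever = QdrantSparseRetriever(document_store=document_store)
result = retriever.run(query_sparse_embedding=SparseEmbedding(indices=[0, 7], values=[0.5, 0.5]))
print(result["documents"])
```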
4 changes: 3 additions & 1 deletion integrations/qdrant/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = ["haystack-ai", "qdrant-client"]
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]

[project.urls]
Source = "https://github.com/deepset-ai/haystack-core-integrations"
@@ -103,6 +103,8 @@ ignore = [
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
+# Allow boolean arguments in function definition
+"FBT001", "FBT002",
# Ignore checks for possible passwords
"S105",
"S106",
integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py
@@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

-from .retriever import QdrantEmbeddingRetriever
+from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever

-__all__ = ("QdrantEmbeddingRetriever",)
+__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever")
integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py
@@ -1,13 +1,14 @@
from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore


@component
class QdrantEmbeddingRetriever:
"""
-A component for retrieving documents from an QdrantDocumentStore.
+A component for retrieving documents from an QdrantDocumentStore using dense vectors.
Usage example:
```python
@@ -32,8 +33,8 @@ def __init__(
document_store: QdrantDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
-scale_score: bool = True,  # noqa: FBT001, FBT002
-return_embedding: bool = False,  # noqa: FBT001, FBT002
+scale_score: bool = True,
+return_embedding: bool = False,
):
"""
Create a QdrantEmbeddingRetriever component.
@@ -120,3 +121,121 @@ def run(
)

return {"documents": docs}


@component
class QdrantSparseRetriever:
"""
A component for retrieving documents from an QdrantDocumentStore using sparse vectors.
Usage example:
```python
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.dataclasses.sparse_embedding import SparseEmbedding
document_store = QdrantDocumentStore(
":memory:",
recreate_index=True,
return_embedding=True,
wait_result_from_api=True,
)
retriever = QdrantSparseRetriever(document_store=document_store)
sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
retriever.run(query_sparse_embedding=sparse_embedding)
```
"""

def __init__(
self,
document_store: QdrantDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
scale_score: bool = True,
return_embedding: bool = False,
):
"""
Create a QdrantSparseRetriever component.
:param document_store: An instance of QdrantDocumentStore.
:param filters: A dictionary with filters to narrow down the search space. Default is None.
:param top_k: The maximum number of documents to retrieve. Default is 10.
:param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
:param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.
:raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
"""

if not isinstance(document_store, QdrantDocumentStore):
msg = "document_store must be an instance of QdrantDocumentStore"
raise ValueError(msg)

self._document_store = document_store
self._filters = filters
self._top_k = top_k
self._scale_score = scale_score
self._return_embedding = return_embedding

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
d = default_to_dict(
self,
document_store=self._document_store,
filters=self._filters,
top_k=self._top_k,
scale_score=self._scale_score,
return_embedding=self._return_embedding,
)
d["init_parameters"]["document_store"] = self._document_store.to_dict()

return d

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
data["init_parameters"]["document_store"] = document_store
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
def run(
self,
query_sparse_embedding: SparseEmbedding,
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None,
scale_score: Optional[bool] = None,
return_embedding: Optional[bool] = None,
):
"""
Run the Sparse Embedding Retriever on the given input data.
:param query_sparse_embedding: Sparse Embedding of the query.
:param filters: A dictionary with filters to narrow down the search space.
:param top_k: The maximum number of documents to return.
:param scale_score: Whether to scale the scores of the retrieved documents or not.
:param return_embedding: Whether to return the embedding of the retrieved Documents.
:returns:
The retrieved documents.
"""
docs = self._document_store.query_by_sparse(
query_sparse_embedding=query_sparse_embedding,
filters=filters or self._filters,
top_k=top_k or self._top_k,
scale_score=scale_score or self._scale_score,
return_embedding=return_embedding or self._return_embedding,
)

return {"documents": docs}
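
A note on the serialization pattern used above: `to_dict()` nests the serialized document store under `init_parameters`, and `from_dict()` rebuilds the store before the component itself. A minimal round-trip sketch, assuming an in-memory store:

```python
from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

retriever = QdrantSparseRetriever(document_store=QdrantDocumentStore(":memory:"), top_k=5)

data = retriever.to_dict()
# data["init_parameters"]["document_store"] is itself a dict, so the whole
# component can be reconstructed from plain data.
restored = QdrantSparseRetriever.from_dict(data)
assert restored._top_k == 5
```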
integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py
@@ -7,64 +7,74 @@

logger = logging.getLogger(__name__)

+DENSE_VECTORS_NAME = "text-dense"
+SPARSE_VECTORS_NAME = "text-sparse"
+
-class HaystackToQdrant:
-    """A converter from Haystack to Qdrant types."""
-
-    UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
+UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")

-    def documents_to_batch(
-        self,
-        documents: List[Document],
-        *,
-        embedding_field: str,
-    ) -> List[rest.PointStruct]:
-        points = []
-        for document in documents:
-            payload = document.to_dict(flatten=False)
+
+def convert_haystack_documents_to_qdrant_points(
+    documents: List[Document],
+    *,
+    embedding_field: str,
+    use_sparse_embeddings: bool,
+) -> List[rest.PointStruct]:
+    points = []
+    for document in documents:
+        payload = document.to_dict(flatten=False)
+        if use_sparse_embeddings:
+            vector = {}
+
+            dense_vector = payload.pop(embedding_field, None)
+            if dense_vector is not None:
+                vector[DENSE_VECTORS_NAME] = dense_vector
+
+            sparse_vector = payload.pop("sparse_embedding", None)
+            if sparse_vector is not None:
+                sparse_vector_instance = rest.SparseVector(**sparse_vector)
+                vector[SPARSE_VECTORS_NAME] = sparse_vector_instance
+
+        else:
            vector = payload.pop(embedding_field) or {}
-            _id = self.convert_id(payload.get("id"))
-
-            # TODO: remove as soon as we introduce the support for sparse embeddings in Qdrant
-            if "sparse_embedding" in payload:
-                sparse_embedding = payload.pop("sparse_embedding", None)
-                if sparse_embedding:
-                    logger.warning(
-                        "Document %s has the `sparse_embedding` field set,"
-                        "but storing sparse embeddings in Qdrant is not currently supported."
-                        "The `sparse_embedding` field will be ignored.",
-                        payload["id"],
-                    )
-
-            point = rest.PointStruct(
-                payload=payload,
-                vector=vector,
-                id=_id,
-            )
-            points.append(point)
-        return points
-
-    def convert_id(self, _id: str) -> str:
-        """
-        Converts any string into a UUID-like format in a deterministic way.
-        Qdrant does not accept any string as an id, so an internal id has to be
-        generated for each point. This is a deterministic way of doing so.
-        """
-        return uuid.uuid5(self.UUID_NAMESPACE, _id).hex
+        _id = convert_id(payload.get("id"))
+
+        point = rest.PointStruct(
+            payload=payload,
+            vector=vector,
+            id=_id,
+        )
+        points.append(point)
+    return points
+
+
+def convert_id(_id: str) -> str:
+    """
+    Converts any string into a UUID-like format in a deterministic way.
+    Qdrant does not accept any string as an id, so an internal id has to be
+    generated for each point. This is a deterministic way of doing so.
+    """
+    return uuid.uuid5(UUID_NAMESPACE, _id).hex


QdrantPoint = Union[rest.ScoredPoint, rest.Record]


-class QdrantToHaystack:
-    def __init__(self, content_field: str, name_field: str, embedding_field: str):
-        self.content_field = content_field
-        self.name_field = name_field
-        self.embedding_field = embedding_field
-
-    def point_to_document(self, point: QdrantPoint) -> Document:
-        payload = {**point.payload}
+def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document:
+    payload = {**point.payload}
+    payload["score"] = point.score if hasattr(point, "score") else None
+
+    if not use_sparse_embeddings:
        payload["embedding"] = point.vector if hasattr(point, "vector") else None
-        payload["score"] = point.score if hasattr(point, "score") else None
-        return Document.from_dict(payload)
+    elif hasattr(point, "vector") and point.vector is not None:
+        payload["embedding"] = point.vector.get(DENSE_VECTORS_NAME)
+
+        if SPARSE_VECTORS_NAME in point.vector:
+            parse_vector_dict = {
+                "indices": point.vector[SPARSE_VECTORS_NAME].indices,
+                "values": point.vector[SPARSE_VECTORS_NAME].values,
+            }
+            payload["sparse_embedding"] = parse_vector_dict
+
+    return Document.from_dict(payload)
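
For orientation, here is a sketch of what the new converter does with a document that carries both embeddings. The import path mirrors the module changed above and the values are illustrative only:

```python
from haystack.dataclasses import Document
from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack_integrations.document_stores.qdrant.converters import (
    convert_haystack_documents_to_qdrant_points,
)

doc = Document(
    content="Named dense and sparse vectors in one Qdrant point.",
    embedding=[0.12, 0.45, 0.33],
    sparse_embedding=SparseEmbedding(indices=[7, 42], values=[0.9, 0.3]),
)

points = convert_haystack_documents_to_qdrant_points(
    [doc], embedding_field="embedding", use_sparse_embeddings=True
)

# With use_sparse_embeddings=True each point carries two named vectors:
#   points[0].vector["text-dense"]  -> [0.12, 0.45, 0.33]
#   points[0].vector["text-sparse"] -> rest.SparseVector(indices=[7, 42], values=[0.9, 0.3])
```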