From c1d6d48b636a2ccecd7e70a7fc28eeeae997bbb2 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 13 Nov 2024 18:37:36 +0100 Subject: [PATCH] squashing --- README.md | 2 +- .../fastembed/examples/ranker_example.py | 22 ++ integrations/fastembed/pydoc/config.yml | 3 +- integrations/fastembed/pyproject.toml | 6 +- .../components/rankers/fastembed/__init__.py | 3 + .../components/rankers/fastembed/ranker.py | 202 ++++++++++++ .../fastembed/tests/test_fastembed_ranker.py | 292 ++++++++++++++++++ 7 files changed, 527 insertions(+), 3 deletions(-) create mode 100644 integrations/fastembed/examples/ranker_example.py create mode 100644 integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py create mode 100644 integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py create mode 100644 integrations/fastembed/tests/test_fastembed_ranker.py diff --git a/README.md b/README.md index 2b4a83253..af83d045d 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [fastembed-haystack](integrations/fastembed/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | +| [fastembed-haystack](integrations/fastembed/) | Embedder, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | | [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) | | [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | | 
[instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | diff --git a/integrations/fastembed/examples/ranker_example.py b/integrations/fastembed/examples/ranker_example.py new file mode 100644 index 000000000..7a31e4646 --- /dev/null +++ b/integrations/fastembed/examples/ranker_example.py @@ -0,0 +1,22 @@ +from haystack import Document + +from haystack_integrations.components.rankers.fastembed import FastembedRanker + +query = "Who is maintaining Qdrant?" +documents = [ + Document( + content="This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc." + ), + Document(content="fastembed is supported by and maintained by Qdrant."), +] + +ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2") +ranker.warm_up() +reranked_documents = ranker.run(query=query, documents=documents)["documents"] + + +print(reranked_documents[0]) + +# Document(id=..., +# content: 'fastembed is supported by and maintained by Qdrant.', +# score: 5.472434997558594..) 
diff --git a/integrations/fastembed/pydoc/config.yml b/integrations/fastembed/pydoc/config.yml index aad50e52c..8ab538cf8 100644 --- a/integrations/fastembed/pydoc/config.yml +++ b/integrations/fastembed/pydoc/config.yml @@ -6,7 +6,8 @@ loaders: "haystack_integrations.components.embedders.fastembed.fastembed_document_embedder", "haystack_integrations.components.embedders.fastembed.fastembed_text_embedder", "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder", - "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder" + "haystack_integrations.components.embedders.fastembed.fastembed_sparse_text_embedder", + "haystack_integrations.components.rankers.fastembed.ranker" ] ignore_when_discovered: ["__init__"] processors: diff --git a/integrations/fastembed/pyproject.toml b/integrations/fastembed/pyproject.toml index b9f1f6cfd..abae78d8a 100644 --- a/integrations/fastembed/pyproject.toml +++ b/integrations/fastembed/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.0.1", "fastembed>=0.2.5", "onnxruntime<1.20.0"] +dependencies = ["haystack-ai>=2.0.1", "fastembed>=0.4.2"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations" @@ -154,6 +154,10 @@ omit = ["*/tests/*", "*/__init__.py"] show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] +[tool.pytest.ini_options] +minversion = "6.0" +markers = ["unit: unit tests", "integration: integration tests"] + [[tool.mypy.overrides]] module = [ "haystack.*", diff --git a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py new file mode 100644 index 000000000..ece5e858b --- /dev/null +++ 
b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py @@ -0,0 +1,3 @@ +from .ranker import FastembedRanker + +__all__ = ["FastembedRanker"] diff --git a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py new file mode 100644 index 000000000..8f077a30c --- /dev/null +++ b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py @@ -0,0 +1,202 @@ +from typing import Any, Dict, List, Optional + +from haystack import Document, component, default_from_dict, default_to_dict, logging + +from fastembed.rerank.cross_encoder import TextCrossEncoder + +logger = logging.getLogger(__name__) + + +@component +class FastembedRanker: + """ + Ranks Documents based on their similarity to the query using + [Fastembed models](https://qdrant.github.io/fastembed/examples/Supported_Models/). + + Documents are indexed from most to least semantically relevant to the query. + + Usage example: + ```python + from haystack import Document + from haystack_integrations.components.rankers.fastembed import FastembedRanker + + ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2", top_k=2) + + docs = [Document(content="Paris"), Document(content="Berlin")] + query = "What is the capital of germany?" + output = ranker.run(query=query, documents=docs) + print(output["documents"][0].content) + + # Berlin + ``` + """ + + def __init__( + self, + model_name: str = "Xenova/ms-marco-MiniLM-L-6-v2", + top_k: int = 10, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + batch_size: int = 64, + parallel: Optional[int] = None, + local_files_only: bool = False, + meta_fields_to_embed: Optional[List[str]] = None, + meta_data_separator: str = "\n", + ): + """ + Creates an instance of the 'FastembedRanker'. + + :param model_name: Fastembed model name. 
Check the list of supported models in the [Fastembed documentation](https://qdrant.github.io/fastembed/examples/Supported_Models/). + :param top_k: The maximum number of documents to return. + :param cache_dir: The path to the cache directory. + Can be set using the `FASTEMBED_CACHE_PATH` env variable. + Defaults to `fastembed_cache` in the system's temp directory. + :param threads: The number of threads single onnxruntime session can use. Defaults to None. + :param batch_size: Number of strings to encode at once. + :param parallel: + If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets. + If 0, use all available cores. + If None, don't use data-parallel processing, use default onnxruntime threading instead. + :param local_files_only: If `True`, only use the model files in the `cache_dir`. + :param meta_fields_to_embed: List of meta fields that should be concatenated + with the document content for reranking. + :param meta_data_separator: Separator used to concatenate the meta fields + to the Document content. + """ + if top_k <= 0: + msg = f"top_k must be > 0, but got {top_k}" + raise ValueError(msg) + + self.model_name = model_name + self.top_k = top_k + self.cache_dir = cache_dir + self.threads = threads + self.batch_size = batch_size + self.parallel = parallel + self.local_files_only = local_files_only + self.meta_fields_to_embed = meta_fields_to_embed or [] + self.meta_data_separator = meta_data_separator + self._model = None + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
+ """ + return default_to_dict( + self, + model_name=self.model_name, + top_k=self.top_k, + cache_dir=self.cache_dir, + threads=self.threads, + batch_size=self.batch_size, + parallel=self.parallel, + local_files_only=self.local_files_only, + meta_fields_to_embed=self.meta_fields_to_embed, + meta_data_separator=self.meta_data_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "FastembedRanker": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. + """ + return default_from_dict(cls, data) + + def warm_up(self): + """ + Initializes the component. + """ + if self._model is None: + self._model = TextCrossEncoder( + model_name=self.model_name, + cache_dir=self.cache_dir, + threads=self.threads, + local_files_only=self.local_files_only, + ) + + def _prepare_fastembed_input_docs(self, documents: List[Document]) -> List[str]: + """ + Prepare the input by concatenating the document text with the metadata fields specified. + :param documents: The list of Document objects. + + :return: A list of strings to be given as input to Fastembed model. + """ + concatenated_input_list = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta.get(key) + ] + concatenated_input = self.meta_data_separator.join([*meta_values_to_embed, doc.content or ""]) + concatenated_input_list.append(concatenated_input) + + return concatenated_input_list + + @component.output_types(documents=List[Document]) + def run(self, query: str, documents: List[Document], top_k: Optional[int] = None): + """ + Returns a list of documents ranked by their similarity to the given query, using FastEmbed. + + :param query: + The input query to compare the documents to. + :param documents: + A list of documents to be ranked. + :param top_k: + The maximum number of documents to return. 
+ + :returns: + A dictionary with the following keys: + - `documents`: A list of documents closest to the query, sorted from most similar to least similar. + + :raises ValueError: If `top_k` is not > 0. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + msg = "FastembedRanker expects a list of Documents as input. " + raise TypeError(msg) + if query == "": + msg = "No query provided" + raise ValueError(msg) + + if not documents: + return {"documents": []} + + top_k = top_k or self.top_k + if top_k <= 0: + msg = f"top_k must be > 0, but got {top_k}" + raise ValueError(msg) + + if self._model is None: + msg = "The ranker model has not been loaded. Please call warm_up() before running." + raise RuntimeError(msg) + + fastembed_input_docs = self._prepare_fastembed_input_docs(documents) + + scores = list( + self._model.rerank( + query=query, + documents=fastembed_input_docs, + batch_size=self.batch_size, + parallel=self.parallel, + ) + ) + + # Combine the two lists into a single list of tuples + doc_scores = list(zip(documents, scores)) + + # Sort the list of tuples by the score in descending order + sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True) + + # Get the top_k documents + top_k_documents = [] + for doc, score in sorted_doc_scores[:top_k]: + doc.score = score + top_k_documents.append(doc) + + return {"documents": top_k_documents} diff --git a/integrations/fastembed/tests/test_fastembed_ranker.py b/integrations/fastembed/tests/test_fastembed_ranker.py new file mode 100644 index 000000000..e38229c87 --- /dev/null +++ b/integrations/fastembed/tests/test_fastembed_ranker.py @@ -0,0 +1,292 @@ +from unittest.mock import MagicMock + +import pytest +from haystack import Document, default_from_dict + +from haystack_integrations.components.rankers.fastembed.ranker import ( + FastembedRanker, +) + + +class TestFastembedRanker: + def test_init_default(self): + """ + Test default initialization 
parameters for FastembedRanker. + """ + ranker = FastembedRanker(model_name="BAAI/bge-reranker-base") + assert ranker.model_name == "BAAI/bge-reranker-base" + assert ranker.top_k == 10 + assert ranker.cache_dir is None + assert ranker.threads is None + assert ranker.batch_size == 64 + assert ranker.parallel is None + assert not ranker.local_files_only + assert ranker.meta_fields_to_embed == [] + assert ranker.meta_data_separator == "\n" + + def test_init_with_parameters(self): + """ + Test custom initialization parameters for FastembedRanker. + """ + ranker = FastembedRanker( + model_name="BAAI/bge-reranker-base", + top_k=64, + cache_dir="fake_dir", + threads=2, + batch_size=50, + parallel=1, + local_files_only=True, + meta_fields_to_embed=["test_field"], + meta_data_separator=" | ", + ) + assert ranker.model_name == "BAAI/bge-reranker-base" + assert ranker.top_k == 64 + assert ranker.cache_dir == "fake_dir" + assert ranker.threads == 2 + assert ranker.batch_size == 50 + assert ranker.parallel == 1 + assert ranker.local_files_only + assert ranker.meta_fields_to_embed == ["test_field"] + assert ranker.meta_data_separator == " | " + + def test_init_with_incorrect_input(self): + """ + Test for checking incorrect input format on init + """ + with pytest.raises( + ValueError, + match="top_k must be > 0, but got 0", + ): + FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2", top_k=0) + + with pytest.raises( + ValueError, + match="top_k must be > 0, but got -3", + ): + FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2", top_k=-3) + + def test_to_dict(self): + """ + Test serialization of FastembedRanker to a dictionary, using default initialization parameters. 
+ """ + ranker = FastembedRanker(model_name="BAAI/bge-reranker-base") + ranker_dict = ranker.to_dict() + assert ranker_dict == { + "type": "haystack_integrations.components.rankers.fastembed.ranker.FastembedRanker", + "init_parameters": { + "model_name": "BAAI/bge-reranker-base", + "top_k": 10, + "cache_dir": None, + "threads": None, + "batch_size": 64, + "parallel": None, + "local_files_only": False, + "meta_fields_to_embed": [], + "meta_data_separator": "\n", + }, + } + + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of FastembedRanker to a dictionary, using custom initialization parameters. + """ + ranker = FastembedRanker( + model_name="BAAI/bge-reranker-base", + cache_dir="fake_dir", + threads=2, + top_k=5, + batch_size=50, + parallel=1, + local_files_only=True, + meta_fields_to_embed=["test_field"], + meta_data_separator=" | ", + ) + ranker_dict = ranker.to_dict() + assert ranker_dict == { + "type": "haystack_integrations.components.rankers.fastembed.ranker.FastembedRanker", + "init_parameters": { + "model_name": "BAAI/bge-reranker-base", + "cache_dir": "fake_dir", + "threads": 2, + "top_k": 5, + "batch_size": 50, + "parallel": 1, + "local_files_only": True, + "meta_fields_to_embed": ["test_field"], + "meta_data_separator": " | ", + }, + } + + def test_from_dict(self): + """ + Test deserialization of FastembedRanker from a dictionary, using default initialization parameters. 
+ """ + ranker_dict = { + "type": "haystack_integrations.components.rankers.fastembed.ranker.FastembedRanker", + "init_parameters": { + "model_name": "BAAI/bge-reranker-base", + "cache_dir": None, + "threads": None, + "top_k": 5, + "batch_size": 50, + "parallel": None, + "local_files_only": False, + "meta_fields_to_embed": [], + "meta_data_separator": "\n", + }, + } + ranker = default_from_dict(FastembedRanker, ranker_dict) + assert ranker.model_name == "BAAI/bge-reranker-base" + assert ranker.cache_dir is None + assert ranker.threads is None + assert ranker.top_k == 5 + assert ranker.batch_size == 50 + assert ranker.parallel is None + assert not ranker.local_files_only + assert ranker.meta_fields_to_embed == [] + assert ranker.meta_data_separator == "\n" + + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of FastembedRanker from a dictionary, using custom initialization parameters. + """ + ranker_dict = { + "type": "haystack_integrations.components.rankers.fastembed.ranker.FastembedRanker", + "init_parameters": { + "model_name": "BAAI/bge-reranker-base", + "cache_dir": "fake_dir", + "threads": 2, + "top_k": 5, + "batch_size": 50, + "parallel": 1, + "local_files_only": True, + "meta_fields_to_embed": ["test_field"], + "meta_data_separator": " | ", + }, + } + ranker = default_from_dict(FastembedRanker, ranker_dict) + assert ranker.model_name == "BAAI/bge-reranker-base" + assert ranker.cache_dir == "fake_dir" + assert ranker.threads == 2 + assert ranker.top_k == 5 + assert ranker.batch_size == 50 + assert ranker.parallel == 1 + assert ranker.local_files_only + assert ranker.meta_fields_to_embed == ["test_field"] + assert ranker.meta_data_separator == " | " + + def test_run_incorrect_input_format(self): + """ + Test for checking incorrect input format. 
+ """ + ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2") + ranker._model = "mock_model" + + query = "query" + string_input = "text" + list_integers_input = [1, 2, 3] + list_document = [Document("Document 1")] + + with pytest.raises( + TypeError, + match="FastembedRanker expects a list of Documents as input.", + ): + ranker.run(query=query, documents=string_input) + + with pytest.raises( + TypeError, + match="FastembedRanker expects a list of Documents as input.", + ): + ranker.run(query=query, documents=list_integers_input) + + with pytest.raises( + ValueError, + match="No query provided", + ): + ranker.run(query="", documents=list_document) + + with pytest.raises( + ValueError, + match="top_k must be > 0, but got -3", + ): + ranker.run(query=query, documents=list_document, top_k=-3) + + def test_run_no_warmup(self): + """ + Test for checking error when calling without a warmup. + """ + ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2") + + query = "query" + list_document = [Document("Document 1")] + + with pytest.raises( + RuntimeError, + ): + ranker.run(query=query, documents=list_document) + + def test_run_empty_document_list(self): + """ + Test for no error when sending no documents. + """ + ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2") + ranker._model = "mock_model" + + query = "query" + list_document = [] + + result = ranker.run(query=query, documents=list_document) + assert len(result["documents"]) == 0 + + def test_embed_metadata(self): + """ + Tests the embedding of metadata fields in document content for ranking. 
+ """ + ranker = FastembedRanker( + model_name="model_name", + meta_fields_to_embed=["meta_field"], + ) + ranker._model = MagicMock() + + documents = [Document(content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] + query = "test" + ranker.run(query=query, documents=documents) + + ranker._model.rerank.assert_called_once_with( + query=query, + documents=[ + "meta_value 0\ndocument-number 0", + "meta_value 1\ndocument-number 1", + "meta_value 2\ndocument-number 2", + "meta_value 3\ndocument-number 3", + "meta_value 4\ndocument-number 4", + ], + batch_size=64, + parallel=None, + ) + + @pytest.mark.integration + def test_run(self): + ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-6-v2", top_k=2) + ranker.warm_up() + + query = "Who is maintaining Qdrant?" + documents = [ + Document( + content="This is built to be faster and lighter than other embedding \ +libraries e.g. Transformers, Sentence-Transformers, etc." + ), + Document(content="This is some random input"), + Document(content="fastembed is supported by and maintained by Qdrant."), + ] + + result = ranker.run(query=query, documents=documents) + + assert len(result["documents"]) == 2 + first_document = result["documents"][0] + second_document = result["documents"][1] + + assert isinstance(first_document, Document) + assert isinstance(second_document, Document) + assert first_document.content == "fastembed is supported by and maintained by Qdrant." + assert first_document.score > second_document.score