From f6efef4320d2d5a401f3654fb037c55093bc510c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 9 Jan 2024 15:18:34 +0100 Subject: [PATCH 01/15] Add weight and ranking_mode as params to run for easier experimentation --- haystack/components/rankers/meta_field.py | 46 +++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 6980d3af1d..a708225350 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -45,7 +45,8 @@ def __init__( 0 disables ranking by a metadata field. 0.5 content and metadata fields have the same impact for the ranking. 1 means ranking by a metadata field only. The highest value comes first. - :param top_k: The maximum number of Documents you want the Ranker to return per query. + :param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the + Ranker returns all documents it receives in the new ranking order. :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. @@ -86,7 +87,13 @@ def to_dict(self) -> Dict[str, Any]: ) @component.output_types(documents=List[Document]) - def run(self, documents: List[Document], top_k: Optional[int] = None): + def run( + self, + documents: List[Document], + top_k: Optional[int] = None, + weight: Optional[float] = None, + ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, + ): """ Use this method to rank a list of Documents based on the selected metadata field by: 1. Sorting the Documents by the metadata field in descending order. @@ -94,7 +101,17 @@ def run(self, documents: List[Document], top_k: Optional[int] = None): 3. Returning the top-k documents. :param documents: Documents to be ranked. - :param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the Ranker returns all Documents it received. + :param top_k: (optional) The number of Documents you want the Ranker to return. + If not provided, the top_k provided at initialization time is used. + :param weight: (optional) In range [0,1]. + 0 disables ranking by a metadata field. + 0.5 content and metadata fields have the same impact for the ranking. + 1 means ranking by a metadata field only. The highest value comes first. + If not provided, the weight provided at initialization time is used. + :param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores. + Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. + Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. + If not provided, the ranking_mode provided at initialization time is used. """ if not documents: return {"documents": []} @@ -104,6 +121,29 @@ def run(self, documents: List[Document], top_k: Optional[int] = None): elif top_k <= 0: raise ValueError(f"top_k must be > 0, but got {top_k}") + weight = weight or self.weight + if weight < 0 or weight > 1: + raise ValueError( + """ + Parameter must be in range [0,1] but is currently set to '{}'.\n + '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n + Change the parameter to a value in range 0 to 1. 
+ """.format( + weight + ) + ) + + ranking_mode = ranking_mode or self.ranking_mode + if ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: + raise ValueError( + """ + The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n + Change the value to 'reciprocal_rank_fusion' or 'linear_score'. + """.format( + ranking_mode + ) + ) + try: sorted_by_metadata = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) except KeyError: From fcdcb62d96c76044d7ab0ae6aa108162bcfed418 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 08:43:10 +0100 Subject: [PATCH 02/15] renaming of metadata to meta --- haystack/components/rankers/meta_field.py | 51 ++++++++++++----------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index a708225350..8b301dbd6c 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -11,7 +11,7 @@ @component class MetaFieldRanker: """ - Ranks Documents based on the value of their specific metadata field. The ranking is done in a descending order. + Ranks Documents based on the value of their specific meta field. The ranking is done in a descending order. Usage example: ``` @@ -36,32 +36,36 @@ def __init__( weight: float = 1.0, top_k: Optional[int] = None, ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", + infer_type: bool = True, ): """ Creates an instance of MetaFieldRanker. - :param meta_field: The name of the metadata field to rank by. + :param meta_field: The name of the meta field to rank by. :param weight: In range [0,1]. - 0 disables ranking by a metadata field. - 0.5 content and metadata fields have the same impact for the ranking. - 1 means ranking by a metadata field only. The highest value comes first. + 0 disables ranking by a meta field. + 0.5 content and meta fields have the same impact for the ranking. + 1 means ranking by a meta field only. The highest value comes first. :param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the Ranker returns all documents it receives in the new ranking order. :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. + :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have + the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object. """ self.meta_field = meta_field self.weight = weight self.top_k = top_k self.ranking_mode = ranking_mode + self.infer_type = infer_type if self.weight < 0 or self.weight > 1: raise ValueError( """ Parameter must be in range [0,1] but is currently set to '{}'.\n - '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n + '0' disables sorting by a meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and '1' ranks by the meta field only.\n Change the parameter to a value in range 0 to 1 when initializing the MetaFieldRanker. 
""".format( self.weight @@ -95,18 +99,18 @@ def run( ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, ): """ - Use this method to rank a list of Documents based on the selected metadata field by: - 1. Sorting the Documents by the metadata field in descending order. - 2. Merging the scores from the metadata field with the scores from the previous component according to the strategy and weight provided. + Use this method to rank a list of Documents based on the selected meta field by: + 1. Sorting the Documents by the meta field in descending order. + 2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided. 3. Returning the top-k documents. :param documents: Documents to be ranked. :param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the top_k provided at initialization time is used. :param weight: (optional) In range [0,1]. - 0 disables ranking by a metadata field. - 0.5 content and metadata fields have the same impact for the ranking. - 1 means ranking by a metadata field only. The highest value comes first. + 0 disables ranking by a meta field. + 0.5 content and meta fields have the same impact for the ranking. + 1 means ranking by a meta field only. The highest value comes first. If not provided, the weight provided at initialization time is used. :param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. @@ -116,9 +120,8 @@ def run( if not documents: return {"documents": []} - if top_k is None: - top_k = self.top_k - elif top_k <= 0: + top_k = top_k or self.top_k + if top_k is not None and top_k <= 0: raise ValueError(f"top_k must be > 0, but got {top_k}") weight = weight or self.weight @@ -126,7 +129,7 @@ def run( raise ValueError( """ Parameter must be in range [0,1] but is currently set to '{}'.\n - '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n + '0' disables sorting by a meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and '1' ranks by the meta field only.\n Change the parameter to a value in range 0 to 1. """.format( weight @@ -145,27 +148,27 @@ def run( ) try: - sorted_by_metadata = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) + sorted_by_meta = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) except KeyError: raise ComponentError( """ - The parameter is currently set to '{}' but the Documents {} don't have this metadata key.\n - Double-check the names of the metadata fields in your documents \n - and set to the name of the field that contains the metadata you want to use for ranking. + The parameter is currently set to '{}' but the Documents {} don't have this meta key.\n + Double-check the names of the meta fields in your documents \n + and set to the name of the field that contains the meta you want to use for ranking. 
""".format( self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta]) ) ) if self.weight > 0: - sorted_documents = self._merge_scores(documents, sorted_by_metadata) + sorted_documents = self._merge_scores(documents, sorted_by_meta) return {"documents": sorted_documents[:top_k]} else: - return {"documents": sorted_by_metadata[:top_k]} + return {"documents": sorted_by_meta[:top_k]} def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: """ - Merge scores for Documents sorted both by their content and by their metadata field. + Merge scores for Documents sorted both by their content and by their meta field. """ scores_map: Dict = defaultdict(int) @@ -207,7 +210,7 @@ def _calculate_rrf(rank: int, k: int = 61) -> float: @staticmethod def _calc_linear_score(rank: int, amount: int) -> float: """ - Calculate the metadata field score as a linear score between the greatest and the lowest score in the list. + Calculate the meta field score as a linear score between the greatest and the lowest score in the list. This linear scaling is useful for: - Reducing the effect of outliers - Creating scores that are meaningfully distributed in the range [0,1], From 803ad270872252c023ef73f7d769755f3b32d0f1 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 09:26:07 +0100 Subject: [PATCH 03/15] User logger.warning instead of warnings --- haystack/components/rankers/meta_field.py | 55 +++++++++------------- test/components/rankers/test_metafield.py | 57 ++++++++++++++--------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 8b301dbd6c..f4625bc208 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -1,5 +1,4 @@ import logging -import warnings from collections import defaultdict from typing import List, Dict, Any, Optional, Literal @@ -36,6 +35,7 @@ def __init__( weight: float = 1.0, top_k: Optional[int] = None, ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", + sort_order: Literal["ascending", "descending"] = "descending", infer_type: bool = True, ): """ @@ -51,6 +51,7 @@ def __init__( :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. + :param sort_order: :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object. 
""" @@ -60,8 +61,15 @@ def __init__( self.top_k = top_k self.ranking_mode = ranking_mode self.infer_type = infer_type + self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode) - if self.weight < 0 or self.weight > 1: + def _validate_params( + self, weight: float, top_k: Optional[int], ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] + ): + if top_k is not None and top_k <= 0: + raise ValueError(f"top_k must be > 0, but got {top_k}") + + if weight < 0 or weight > 1: raise ValueError( """ Parameter must be in range [0,1] but is currently set to '{}'.\n @@ -72,13 +80,13 @@ def __init__( ) ) - if self.ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: + if ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: raise ValueError( """ The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n Change the value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker. """.format( - self.ranking_mode + ranking_mode ) ) @@ -87,7 +95,12 @@ def to_dict(self) -> Dict[str, Any]: Serialize object to a dictionary. """ return default_to_dict( - self, meta_field=self.meta_field, weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode + self, + meta_field=self.meta_field, + weight=self.weight, + top_k=self.top_k, + ranking_mode=self.ranking_mode, + infer_type=self.infer_type, ) @component.output_types(documents=List[Document]) @@ -121,31 +134,9 @@ def run( return {"documents": []} top_k = top_k or self.top_k - if top_k is not None and top_k <= 0: - raise ValueError(f"top_k must be > 0, but got {top_k}") - weight = weight or self.weight - if weight < 0 or weight > 1: - raise ValueError( - """ - Parameter must be in range [0,1] but is currently set to '{}'.\n - '0' disables sorting by a meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and '1' ranks by the meta field only.\n - Change the parameter to a value in range 0 to 1. - """.format( - weight - ) - ) - ranking_mode = ranking_mode or self.ranking_mode - if ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: - raise ValueError( - """ - The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n - Change the value to 'reciprocal_rank_fusion' or 'linear_score'. 
- """.format( - ranking_mode - ) - ) + self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode) try: sorted_by_meta = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) @@ -180,12 +171,10 @@ def _merge_scores(self, documents: List[Document], sorted_documents: List[Docume for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)): score = float(0) if doc.score is None: - warnings.warn("The score wasn't provided; defaulting to 0.") + logger.warning("The score wasn't provided; defaulting to 0.") elif doc.score < 0 or doc.score > 1: - warnings.warn( - "The score {} for Document {} is outside the [0,1] range; defaulting to 0".format( - doc.score, doc.id - ) + logger.warning( + "The score %s for Document %s is outside the [0,1] range; defaulting to 0", doc.score, doc.id ) else: score = doc.score diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index 1c085ad446..8b3e577703 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -1,4 +1,5 @@ import pytest +import logging from haystack import Document, ComponentError from haystack.components.rankers.meta_field import MetaFieldRanker @@ -15,25 +16,33 @@ def test_to_dict(self): "weight": 1.0, "top_k": None, "ranking_mode": "reciprocal_rank_fusion", + "infer_type": True, }, } def test_to_dict_with_custom_init_parameters(self): - component = MetaFieldRanker(meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score") + component = MetaFieldRanker( + meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", infer_type=False + ) data = component.to_dict() assert data == { "type": "haystack.components.rankers.meta_field.MetaFieldRanker", - "init_parameters": {"meta_field": "rating", "weight": 0.5, "top_k": 5, "ranking_mode": "linear_score"}, + "init_parameters": { + "meta_field": "rating", + "weight": 0.5, + "top_k": 5, + "ranking_mode": "linear_score", + "infer_type": False, + }, } - @pytest.mark.integration - @pytest.mark.parametrize("metafield_values, expected_first_value", [([1.3, 0.7, 2.1], 2.1), ([1, 5, 8], 8)]) - def test_run(self, metafield_values, expected_first_value): + @pytest.mark.parametrize("meta_field_values, expected_first_value", [([1.3, 0.7, 2.1], 2.1), ([1, 5, 8], 8)]) + def test_run(self, meta_field_values, expected_first_value): """ Test if the component ranks documents correctly. 
""" ranker = MetaFieldRanker(meta_field="rating") - docs_before = [Document(content="abc", meta={"rating": value}) for value in metafield_values] + docs_before = [Document(content="abc", meta={"rating": value}) for value in meta_field_values] output = ranker.run(documents=docs_before) docs_after = output["documents"] @@ -44,32 +53,37 @@ def test_run(self, metafield_values, expected_first_value): sorted_scores = sorted([doc.meta["rating"] for doc in docs_after], reverse=True) assert [doc.meta["rating"] for doc in docs_after] == sorted_scores - @pytest.mark.integration + def test_run_with_weight_equal_to_0(self): + ranker = MetaFieldRanker(meta_field="rating", weight=0) + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + sorted_scores = sorted([doc.meta["rating"] for doc in docs_after], reverse=True) + assert [doc.meta["rating"] for doc in docs_after] == sorted_scores + def test_returns_empty_list_if_no_documents_are_provided(self): ranker = MetaFieldRanker(meta_field="rating") output = ranker.run(documents=[]) docs_after = output["documents"] assert docs_after == [] - @pytest.mark.integration def test_raises_component_error_if_metadata_not_found(self): ranker = MetaFieldRanker(meta_field="rating") docs_before = [Document(content="abc", meta={"wrong_field": 1.3})] with pytest.raises(ComponentError): ranker.run(documents=docs_before) - @pytest.mark.integration def test_raises_component_error_if_wrong_ranking_mode(self): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", ranking_mode="wrong_mode") - @pytest.mark.integration @pytest.mark.parametrize("score", [-1, 2, 1.3, 2.1]) def test_raises_component_error_if_wrong_weight(self, score): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", weight=score) - @pytest.mark.integration def test_linear_score(self): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ @@ -81,7 +95,6 @@ def test_linear_score(self): docs_after = output["documents"] assert docs_after[0].score == 0.8 - @pytest.mark.integration def test_reciprocal_rank_fusion(self): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="reciprocal_rank_fusion", weight=0.5) docs_before = [ @@ -93,22 +106,19 @@ def test_reciprocal_rank_fusion(self): docs_after = output["documents"] assert docs_after[0].score == 0.01626123744050767 - @pytest.mark.integration @pytest.mark.parametrize("score", [-1, 2, 1.3, 2.1]) - def test_linear_score_raises_warning_if_doc_wrong_score(self, score): + def test_linear_score_raises_warning_if_doc_wrong_score(self, score, caplog): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ - Document(id=1, content="abc", meta={"rating": 1.3}, score=score), - Document(id=2, content="abc", meta={"rating": 0.7}, score=0.4), - Document(id=3, content="abc", meta={"rating": 2.1}, score=0.6), + Document(id="1", content="abc", meta={"rating": 1.3}, score=score), + Document(id="2", content="abc", meta={"rating": 0.7}, score=0.4), + Document(id="3", content="abc", meta={"rating": 2.1}, score=0.6), ] - with pytest.warns( - UserWarning, match=rf"The score {score} for Document 1 is outside the \[0,1\] range; defaulting to 0" - ): + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert f"The score {score} for Document 1 is outside the [0,1] range; defaulting to 0" in 
caplog.text - @pytest.mark.integration - def test_linear_score_raises_raises_warning_if_doc_without_score(self): + def test_linear_score_raises_raises_warning_if_doc_without_score(self, caplog): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ Document(content="abc", meta={"rating": 1.3}), @@ -116,5 +126,6 @@ def test_linear_score_raises_raises_warning_if_doc_without_score(self): Document(content="abc", meta={"rating": 2.1}), ] - with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."): + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert "The score wasn't provided; defaulting to 0." in caplog.text From bc489f460037407d0bd5d20c34d7a41cedd184da Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 09:27:28 +0100 Subject: [PATCH 04/15] Add another unit test --- test/components/rankers/test_metafield.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index 8b3e577703..52d8404a3d 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -75,10 +75,14 @@ def test_raises_component_error_if_metadata_not_found(self): with pytest.raises(ComponentError): ranker.run(documents=docs_before) - def test_raises_component_error_if_wrong_ranking_mode(self): + def test_raises_value_error_if_wrong_ranking_mode(self): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", ranking_mode="wrong_mode") + def test_raises_value_error_if_wrong_top_k(self): + with pytest.raises(ValueError): + MetaFieldRanker(meta_field="rating", top_k=-1) + @pytest.mark.parametrize("score", [-1, 2, 1.3, 2.1]) def test_raises_component_error_if_wrong_weight(self, score): with pytest.raises(ValueError): From 5875dfb0eafd44313c3586f621ef3d133ee1eec7 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 09:45:38 +0100 Subject: [PATCH 05/15] Add support for sort_order and fix formatting of error messages --- haystack/components/rankers/meta_field.py | 66 +++++++++++++---------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index f4625bc208..37065a52dc 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -10,7 +10,8 @@ @component class MetaFieldRanker: """ - Ranks Documents based on the value of their specific meta field. The ranking is done in a descending order. + Ranks Documents based on the value of their specific meta field. + The ranking can be performed in descending order or ascending order. Usage example: ``` @@ -51,7 +52,8 @@ def __init__( :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. - :param sort_order: + :param sort_order: Whether to sort the meta field by ascending or descending order. + Possible values are `descending` (default) and `ascending`. :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object. 
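            (Illustrative sketch: sort_order="ascending" puts the smallest meta values first — for example the
            earliest dates once infer_type has parsed them, or the lowest prices.)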
""" @@ -60,34 +62,42 @@ def __init__( self.weight = weight self.top_k = top_k self.ranking_mode = ranking_mode + self.sort_order = sort_order self.infer_type = infer_type - self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode) + self._validate_params( + weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode, sort_order=self.sort_order + ) def _validate_params( - self, weight: float, top_k: Optional[int], ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] + self, + weight: float, + top_k: Optional[int], + ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"], + sort_order: Literal["ascending", "descending"], ): if top_k is not None and top_k <= 0: - raise ValueError(f"top_k must be > 0, but got {top_k}") + raise ValueError("top_k must be > 0, but got %s" % top_k) if weight < 0 or weight > 1: raise ValueError( - """ - Parameter must be in range [0,1] but is currently set to '{}'.\n - '0' disables sorting by a meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and '1' ranks by the meta field only.\n - Change the parameter to a value in range 0 to 1 when initializing the MetaFieldRanker. - """.format( - self.weight - ) + "Parameter must be in range [0,1] but is currently set to '%s'.\n'0' disables sorting by a " + "meta field, '0.5' assigns equal weight to the previous relevance scores and the meta field, and " + "'1' ranks by the meta field only.\nChange the parameter to a value in range 0 to 1 when " + "initializing the MetaFieldRanker." % self.weight ) if ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]: raise ValueError( - """ - The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n - Change the value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker. - """.format( - ranking_mode - ) + "The value of parameter must be 'reciprocal_rank_fusion' or 'linear_score', but is " + "currently set to '%s'.\nChange the value to 'reciprocal_rank_fusion' or " + "'linear_score' when initializing the MetaFieldRanker." % ranking_mode + ) + + if sort_order not in ["ascending", "descending"]: + raise ValueError( + "The value of parameter must be 'ascending' or 'descending', but is currently set to '%s'.\n" + "Change the value to 'ascending' or 'descending' when initializing the " + "MetaFieldRanker." 
% sort_order ) def to_dict(self) -> Dict[str, Any]: @@ -110,6 +120,8 @@ def run( top_k: Optional[int] = None, weight: Optional[float] = None, ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, + sort_order: Optional[Literal["ascending", "descending"]] = None, + infer_type: Optional[bool] = None, ): """ Use this method to rank a list of Documents based on the selected meta field by: @@ -136,19 +148,19 @@ def run( top_k = top_k or self.top_k weight = weight or self.weight ranking_mode = ranking_mode or self.ranking_mode - self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode) + sort_order = sort_order or self.sort_order + infer_type = infer_type or self.infer_type + self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order) + reverse = sort_order == "descending" try: - sorted_by_meta = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True) + sorted_by_meta = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) except KeyError: raise ComponentError( - """ - The parameter is currently set to '{}' but the Documents {} don't have this meta key.\n - Double-check the names of the meta fields in your documents \n - and set to the name of the field that contains the meta you want to use for ranking. - """.format( - self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta]) - ) + "The parameter is currently set to '%s' but the Documents %s don't have this meta key.\n" + "Double-check the names of the meta fields in your documents \n" + "and set to the name of the field that contains the meta you want to use for ranking." + % (self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta])) ) if self.weight > 0: From 5f618cf4421c25bc5ceb44d067ab9dbac9afdcc1 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 12:54:52 +0100 Subject: [PATCH 06/15] Make MetaFieldRanker more robust. Doesn't crash pipeline if some Documents are missing keys. --- haystack/components/rankers/meta_field.py | 90 +++++++++++++++++------ test/components/rankers/test_metafield.py | 22 +++++- 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 37065a52dc..ee50f6038b 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -1,8 +1,9 @@ import logging from collections import defaultdict -from typing import List, Dict, Any, Optional, Literal +from typing import List, Dict, Any, Optional, Literal, Set +from dateutil.parser import parse -from haystack import ComponentError, Document, component, default_to_dict +from haystack import Document, component, default_to_dict logger = logging.getLogger(__name__) @@ -125,7 +126,7 @@ def run( ): """ Use this method to rank a list of Documents based on the selected meta field by: - 1. Sorting the Documents by the meta field in descending order. + 1. Sorting the Documents by the meta field in descending or ascending order. 2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided. 3. Returning the top-k documents. @@ -141,6 +142,12 @@ def run( Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. 
If not provided, the ranking_mode provided at initialization time is used. + :param sort_order: Whether to sort the meta field by ascending or descending order. + Possible values are `descending` (default) and `ascending`. + If not provided, the sort_order provided at initialization time is used. + :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have + the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object. + If not provided, the infer_type provided at initialization time is used. """ if not documents: return {"documents": []} @@ -152,23 +159,60 @@ def run( infer_type = infer_type or self.infer_type self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order) - reverse = sort_order == "descending" - try: - sorted_by_meta = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) - except KeyError: - raise ComponentError( - "The parameter is currently set to '%s' but the Documents %s don't have this meta key.\n" - "Double-check the names of the meta fields in your documents \n" - "and set to the name of the field that contains the meta you want to use for ranking." - % (self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta])) + docs_with_meta_field = [] + docs_no_meta_field = [] + unique_meta_values = set() + for document in documents: + # Using try except block to handle situation that the meta_value could be None + try: + meta_value = document.meta[self.meta_field] + docs_with_meta_field.append(document) + unique_meta_values.add(meta_value) + except KeyError: + docs_no_meta_field.append(document) + + if len(docs_with_meta_field) == 0: + logger.warning( + "The parameter is currently set to '%s', but none of the provided Documents have this meta key.\n" + "The provided Document IDs are %s.\n" + "Set to the name of the field that is present within the provided Documents.\n" + % (self.meta_field, ",".join([doc.id for doc in documents])) + ) + + if len(docs_no_meta_field) > 0: + logger.warning( + "The parameter is currently set to '%s' but the Documents with IDs %s don't have this meta key.\n" + "These Documents will be placed at the end of the sorting order.\n" + % (self.meta_field, ",".join([doc.id for doc in docs_no_meta_field])) ) + reverse = sort_order == "descending" + sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) + # Add the docs missing the meta_field back on the end + sorted_by_meta += docs_no_meta_field + if self.weight > 0: sorted_documents = self._merge_scores(documents, sorted_by_meta) return {"documents": sorted_documents[:top_k]} else: return {"documents": sorted_by_meta[:top_k]} + def _infer_type(self, unique_meta_values: Set[str]): + return None + + def _is_date(self, string: str): + """ + Return whether the string can be interpreted as a date. + + :param string: str, string to check for date + """ + try: + parse(string) + except ValueError: + return False + + return True + def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: """ Merge scores for Documents sorted both by their content and by their meta field. 
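        (Sketch, assuming the conventional RRF form 1/(k + rank) with the k=61 default visible in _calculate_rrf:
        a Document at rank i in the incoming order contributes (1 - weight) / (61 + i), its rank j in the
        meta-field order adds weight / (61 + j), and the per-Document sums determine the final order.)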
@@ -176,26 +220,28 @@ def _merge_scores(self, documents: List[Document], sorted_documents: List[Docume scores_map: Dict = defaultdict(int) if self.ranking_mode == "reciprocal_rank_fusion": - for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)): - scores_map[doc.id] += self._calculate_rrf(rank=i) * (1 - self.weight) + for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)): + scores_map[document.id] += self._calculate_rrf(rank=i) * (1 - self.weight) scores_map[sorted_doc.id] += self._calculate_rrf(rank=i) * self.weight elif self.ranking_mode == "linear_score": - for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)): + for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)): score = float(0) - if doc.score is None: + if document.score is None: logger.warning("The score wasn't provided; defaulting to 0.") - elif doc.score < 0 or doc.score > 1: + elif document.score < 0 or document.score > 1: logger.warning( - "The score %s for Document %s is outside the [0,1] range; defaulting to 0", doc.score, doc.id + "The score %s for Document %s is outside the [0,1] range; defaulting to 0", + document.score, + document.id, ) else: - score = doc.score + score = document.score - scores_map[doc.id] += score * (1 - self.weight) + scores_map[document.id] += score * (1 - self.weight) scores_map[sorted_doc.id] += self._calc_linear_score(rank=i, amount=len(sorted_documents)) * self.weight - for doc in documents: - doc.score = scores_map[doc.id] + for document in documents: + document.score = scores_map[document.id] new_sorted_documents = sorted(documents, key=lambda doc: doc.score if doc.score else -1, reverse=True) return new_sorted_documents diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index 52d8404a3d..cff721571e 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -63,17 +63,31 @@ def test_run_with_weight_equal_to_0(self): sorted_scores = sorted([doc.meta["rating"] for doc in docs_after], reverse=True) assert [doc.meta["rating"] for doc in docs_after] == sorted_scores + def test_sort_order_ascending(self): + ranker = MetaFieldRanker(meta_field="rating", weight=0, sort_order="ascending") + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + sorted_scores = sorted([doc.meta["rating"] for doc in docs_after]) + assert [doc.meta["rating"] for doc in docs_after] == sorted_scores + def test_returns_empty_list_if_no_documents_are_provided(self): ranker = MetaFieldRanker(meta_field="rating") output = ranker.run(documents=[]) docs_after = output["documents"] assert docs_after == [] - def test_raises_component_error_if_metadata_not_found(self): + def test_raises_component_error_if_metadata_not_found(self, caplog): ranker = MetaFieldRanker(meta_field="rating") docs_before = [Document(content="abc", meta={"wrong_field": 1.3})] - with pytest.raises(ComponentError): + with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) + assert ( + "The parameter is currently set to 'rating', but none of the provided Documents have this meta key." 
+ in caplog.text + ) def test_raises_value_error_if_wrong_ranking_mode(self): with pytest.raises(ValueError): @@ -88,6 +102,10 @@ def test_raises_component_error_if_wrong_weight(self, score): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", weight=score) + def test_raises_value_error_if_wrong_sort_order(self): + with pytest.raises(ValueError): + MetaFieldRanker(meta_field="rating", sort_order="wrong_order") + def test_linear_score(self): ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5) docs_before = [ From edc3da1a9f17b2fb68813999fe619c136fa2e6eb Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 12:57:29 +0100 Subject: [PATCH 07/15] Don't print same warning message twice --- haystack/components/rankers/meta_field.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index ee50f6038b..2c1e0421f0 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -160,7 +160,7 @@ def run( self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order) docs_with_meta_field = [] - docs_no_meta_field = [] + docs_missing_meta_field = [] unique_meta_values = set() for document in documents: # Using try except block to handle situation that the meta_value could be None @@ -169,27 +169,26 @@ def run( docs_with_meta_field.append(document) unique_meta_values.add(meta_value) except KeyError: - docs_no_meta_field.append(document) + docs_missing_meta_field.append(document) - if len(docs_with_meta_field) == 0: + if len(docs_missing_meta_field) == len(documents): logger.warning( "The parameter is currently set to '%s', but none of the provided Documents have this meta key.\n" "The provided Document IDs are %s.\n" "Set to the name of the field that is present within the provided Documents.\n" % (self.meta_field, ",".join([doc.id for doc in documents])) ) - - if len(docs_no_meta_field) > 0: + elif len(docs_missing_meta_field) > 0: logger.warning( "The parameter is currently set to '%s' but the Documents with IDs %s don't have this meta key.\n" "These Documents will be placed at the end of the sorting order.\n" - % (self.meta_field, ",".join([doc.id for doc in docs_no_meta_field])) + % (self.meta_field, ",".join([doc.id for doc in docs_missing_meta_field])) ) reverse = sort_order == "descending" sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) # Add the docs missing the meta_field back on the end - sorted_by_meta += docs_no_meta_field + sorted_by_meta += docs_missing_meta_field if self.weight > 0: sorted_documents = self._merge_scores(documents, sorted_by_meta) From 8534c6aef83a9f7701b1ad6ef4b53186f4805bd3 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 13:00:10 +0100 Subject: [PATCH 08/15] Add another test --- test/components/rankers/test_metafield.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index cff721571e..b355eb3e8f 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -79,7 +79,7 @@ def test_returns_empty_list_if_no_documents_are_provided(self): docs_after = output["documents"] assert docs_after == [] - def test_raises_component_error_if_metadata_not_found(self, caplog): + def 
test_warning_if_meta_not_found(self, caplog): ranker = MetaFieldRanker(meta_field="rating") docs_before = [Document(content="abc", meta={"wrong_field": 1.3})] with caplog.at_level(logging.WARNING): @@ -89,6 +89,19 @@ def test_raises_component_error_if_metadata_not_found(self, caplog): in caplog.text ) + def test_warning_if_some_meta_not_found(self, caplog): + ranker = MetaFieldRanker(meta_field="rating") + docs_before = [ + Document(id="1", content="abc", meta={"wrong_field": 1.3}), + Document(id="2", content="def", meta={"rating": 1.3}), + ] + with caplog.at_level(logging.WARNING): + ranker.run(documents=docs_before) + assert ( + "The parameter is currently set to 'rating' but the Documents with IDs 1 don't have this meta key." + in caplog.text + ) + def test_raises_value_error_if_wrong_ranking_mode(self): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", ranking_mode="wrong_mode") From f8c3773cd94b49ff80832d480b3d5c2cb0709661 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 14:27:12 +0100 Subject: [PATCH 09/15] Making MetaFieldRanker more robust --- haystack/components/rankers/meta_field.py | 75 +++++++++++++---------- test/components/rankers/test_metafield.py | 6 +- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 2c1e0421f0..9fab049255 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -2,6 +2,7 @@ from collections import defaultdict from typing import List, Dict, Any, Optional, Literal, Set from dateutil.parser import parse +import yaml from haystack import Document, component, default_to_dict @@ -163,7 +164,7 @@ def run( docs_missing_meta_field = [] unique_meta_values = set() for document in documents: - # Using try except block to handle situation that the meta_value could be None + # Using try except block instead of .meta.get() to handle situation that the meta_value could be None try: meta_value = document.meta[self.meta_field] docs_with_meta_field.append(document) @@ -171,50 +172,62 @@ def run( except KeyError: docs_missing_meta_field.append(document) + # If all docs are missing self.meta_field return original documents if len(docs_missing_meta_field) == len(documents): logger.warning( - "The parameter is currently set to '%s', but none of the provided Documents have this meta key.\n" - "The provided Document IDs are %s.\n" - "Set to the name of the field that is present within the provided Documents.\n" - % (self.meta_field, ",".join([doc.id for doc in documents])) + "The parameter is currently set to '%s', but none of the provided Documents with IDs %s have this meta key.\n" + "Set to the name of a field that is present within the provided Documents.\n" + "Returning the original Documents since there are no values to rank.", + self.meta_field, + ",".join([doc.id for doc in documents]), ) - elif len(docs_missing_meta_field) > 0: + return {"documents": documents} + + if len(docs_missing_meta_field) > 0: logger.warning( "The parameter is currently set to '%s' but the Documents with IDs %s don't have this meta key.\n" - "These Documents will be placed at the end of the sorting order.\n" - % (self.meta_field, ",".join([doc.id for doc in docs_missing_meta_field])) + "These Documents will be placed at the end of the sorting order.", + self.meta_field, + ",".join([doc.id for doc in docs_missing_meta_field]), ) - reverse = sort_order == "descending" - sorted_by_meta = 
sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) - # Add the docs missing the meta_field back on the end - sorted_by_meta += docs_missing_meta_field - - if self.weight > 0: - sorted_documents = self._merge_scores(documents, sorted_by_meta) - return {"documents": sorted_documents[:top_k]} + # - If all string type then try to infer using yaml.safe_load ?? + if infer_type and all(isinstance(d.meta[self.meta_field], str) for d in docs_with_meta_field): + ... else: - return {"documents": sorted_by_meta[:top_k]} + logger.warning( + "The parameter is currently set to `%s`, but not all of meta values in the provided Documents are strings.\n" + "Therefore,", + infer_type, + ) - def _infer_type(self, unique_meta_values: Set[str]): - return None + reverse = sort_order == "descending" + try: + sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) + except TypeError as error: + # Return original documents if mixed types that are not comparable are returned (e.g. int and list) + logger.warning( + "Tried to sort Documents with IDs %s, but got TypeError with the message: %s\n" + "Returning the original Documents since meta field ranking is not possible.", + ",".join([doc.id for doc in docs_missing_meta_field]), + error, + ) + return {"documents": documents} - def _is_date(self, string: str): - """ - Return whether the string can be interpreted as a date. + # Add the docs missing the meta_field back on the end + sorted_documents = sorted_by_meta + docs_missing_meta_field - :param string: str, string to check for date - """ - try: - parse(string) - except ValueError: - return False + if self.weight > 0: + sorted_documents = self._merge_rankings(documents, sorted_documents) + return {"documents": sorted_documents[:top_k]} - return True + def _check_all_same_type(self, documents: List[Document]) -> bool: + all_types = {type(d.meta[self.meta_field]) for d in documents} + return len(all_types) == 1 - def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: + def _merge_rankings(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]: """ - Merge scores for Documents sorted both by their content and by their meta field. + Merge the two different rankings for Documents sorted both by their content and by their meta field. """ scores_map: Dict = defaultdict(int) diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index b355eb3e8f..bddfa4a691 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -1,7 +1,7 @@ import pytest import logging -from haystack import Document, ComponentError +from haystack import Document from haystack.components.rankers.meta_field import MetaFieldRanker @@ -81,11 +81,11 @@ def test_returns_empty_list_if_no_documents_are_provided(self): def test_warning_if_meta_not_found(self, caplog): ranker = MetaFieldRanker(meta_field="rating") - docs_before = [Document(content="abc", meta={"wrong_field": 1.3})] + docs_before = [Document(id="1", content="abc", meta={"wrong_field": 1.3})] with caplog.at_level(logging.WARNING): ranker.run(documents=docs_before) assert ( - "The parameter is currently set to 'rating', but none of the provided Documents have this meta key." + "The parameter is currently set to 'rating', but none of the provided Documents with IDs 1 have this meta key." 
in caplog.text ) From 2c311fc9a15386851c475d53fdb694a92f64f3b3 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 14:33:59 +0100 Subject: [PATCH 10/15] Move up if return statement to earlier in the function --- haystack/components/rankers/meta_field.py | 15 ++++++++++----- test/components/rankers/test_metafield.py | 13 +++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 9fab049255..bb04c79292 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -160,6 +160,10 @@ def run( infer_type = infer_type or self.infer_type self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order) + # If the weight is 0 then ranking by meta field is disabled and the original documents should be returned + if weight == 0: + return {"documents": documents[:top_k]} + docs_with_meta_field = [] docs_missing_meta_field = [] unique_meta_values = set() @@ -181,7 +185,7 @@ def run( self.meta_field, ",".join([doc.id for doc in documents]), ) - return {"documents": documents} + return {"documents": documents[:top_k]} if len(docs_missing_meta_field) > 0: logger.warning( @@ -191,7 +195,7 @@ def run( ",".join([doc.id for doc in docs_missing_meta_field]), ) - # - If all string type then try to infer using yaml.safe_load ?? + # If all string type then try to infer using yaml.safe_load ?? if infer_type and all(isinstance(d.meta[self.meta_field], str) for d in docs_with_meta_field): ... else: @@ -201,6 +205,7 @@ def run( infer_type, ) + # Sort the documents by self.meta_field reverse = sort_order == "descending" try: sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse) @@ -212,13 +217,13 @@ def run( ",".join([doc.id for doc in docs_missing_meta_field]), error, ) - return {"documents": documents} + return {"documents": documents[:top_k]} # Add the docs missing the meta_field back on the end sorted_documents = sorted_by_meta + docs_missing_meta_field + # Merge the two ranked lists + sorted_documents = self._merge_rankings(documents, sorted_documents) - if self.weight > 0: - sorted_documents = self._merge_rankings(documents, sorted_documents) return {"documents": sorted_documents[:top_k]} def _check_all_same_type(self, documents: List[Document]) -> bool: diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index bddfa4a691..34972d88e3 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -54,7 +54,16 @@ def test_run(self, meta_field_values, expected_first_value): assert [doc.meta["rating"] for doc in docs_after] == sorted_scores def test_run_with_weight_equal_to_0(self): - ranker = MetaFieldRanker(meta_field="rating", weight=0) + ranker = MetaFieldRanker(meta_field="rating", weight=0.0) + docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + assert [doc.meta["rating"] for doc in docs_after] == [1.1, 0.5, 2.3] + + def test_run_with_weight_equal_to_1(self): + ranker = MetaFieldRanker(meta_field="rating", weight=1.0) docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] output = ranker.run(documents=docs_before) docs_after = output["documents"] @@ -64,7 +73,7 @@ def 
test_run_with_weight_equal_to_0(self): assert [doc.meta["rating"] for doc in docs_after] == sorted_scores def test_sort_order_ascending(self): - ranker = MetaFieldRanker(meta_field="rating", weight=0, sort_order="ascending") + ranker = MetaFieldRanker(meta_field="rating", weight=1.0, sort_order="ascending") docs_before = [Document(content="abc", meta={"rating": value}) for value in [1.1, 0.5, 2.3]] output = ranker.run(documents=docs_before) docs_after = output["documents"] From 3afe7faf33b6aeedf84c63b267b30db711e51077 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 12 Jan 2024 16:08:32 +0100 Subject: [PATCH 11/15] Setting up infer_type --- haystack/components/rankers/meta_field.py | 69 ++++++++++++----------- test/components/rankers/test_metafield.py | 18 +++++- 2 files changed, 51 insertions(+), 36 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index bb04c79292..6bd53932f5 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from typing import List, Dict, Any, Optional, Literal, Set +from typing import List, Dict, Any, Optional, Literal from dateutil.parser import parse import yaml @@ -39,7 +39,7 @@ def __init__( top_k: Optional[int] = None, ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", sort_order: Literal["ascending", "descending"] = "descending", - infer_type: bool = True, + infer_type: bool = False, ): """ Creates an instance of MetaFieldRanker. @@ -164,24 +164,15 @@ def run( if weight == 0: return {"documents": documents[:top_k]} - docs_with_meta_field = [] - docs_missing_meta_field = [] - unique_meta_values = set() - for document in documents: - # Using try except block instead of .meta.get() to handle situation that the meta_value could be None - try: - meta_value = document.meta[self.meta_field] - docs_with_meta_field.append(document) - unique_meta_values.add(meta_value) - except KeyError: - docs_missing_meta_field.append(document) + docs_with_meta_field = [doc for doc in documents if self.meta_field in doc.meta] + docs_missing_meta_field = [doc for doc in documents if self.meta_field not in doc.meta] # If all docs are missing self.meta_field return original documents - if len(docs_missing_meta_field) == len(documents): + if len(docs_with_meta_field) == 0: logger.warning( "The parameter is currently set to '%s', but none of the provided Documents with IDs %s have this meta key.\n" "Set to the name of a field that is present within the provided Documents.\n" - "Returning the original Documents since there are no values to rank.", + "Returning the of the original Documents since there are no values to rank.", self.meta_field, ",".join([doc.id for doc in documents]), ) @@ -195,40 +186,52 @@ def run( ",".join([doc.id for doc in docs_missing_meta_field]), ) - # If all string type then try to infer using yaml.safe_load ?? - if infer_type and all(isinstance(d.meta[self.meta_field], str) for d in docs_with_meta_field): - ... 
-        else:
-            logger.warning(
-                "The parameter is currently set to `%s`, but not all of meta values in the provided Documents are strings.\n"
-                "Therefore,",
-                infer_type,
-            )

+        # Sort the documents by self.meta_field
+        parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, infer_type=infer_type)
         reverse = sort_order == "descending"
+        tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
         try:
-            sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse)
+            sorted_by_meta = sorted(tuple_parsed_meta_and_docs, key=lambda x: x[0], reverse=reverse)
         except TypeError as error:
             # Return original documents if mixed types that are not comparable are returned (e.g. int and list)
             logger.warning(
                 "Tried to sort Documents with IDs %s, but got TypeError with the message: %s\n"
                 "Returning the <top_k> of the original Documents since meta field ranking is not possible.",
-                ",".join([doc.id for doc in docs_missing_meta_field]),
+                ",".join([doc.id for doc in docs_with_meta_field]),
                 error,
             )
             return {"documents": documents[:top_k]}

         # Add the docs missing the meta_field back on the end
+        sorted_by_meta = [doc for meta, doc in sorted_by_meta]
         sorted_documents = sorted_by_meta + docs_missing_meta_field
         # Merge the two ranked lists
         sorted_documents = self._merge_rankings(documents, sorted_documents)
         return {"documents": sorted_documents[:top_k]}

+    def _parse_meta(self, docs_with_meta_field: List[Document], infer_type: bool) -> List[Any]:
+        parse_fn = self._identity
+        if infer_type:
+            # If all values are strings, try to parse them with self._date_parse; otherwise fall back to self._identity
+            unique_meta_values = {doc.meta[self.meta_field] for doc in docs_with_meta_field}
+            if all(isinstance(meta_value, str) for meta_value in unique_meta_values):
+                parse_fn = self._date_parse
+            else:
+                logger.warning(
+                    "The parameter <infer_type> is currently set to `True`, but not all of the meta values in the "
+                    "provided Documents with IDs %s are strings.\n"
+                    "Therefore, inferring the type of the meta values will be skipped.\n"
+                    "Set all meta values found under the <meta_field> parameter to strings to use <infer_type>.",
+                    ",".join([doc.id for doc in docs_with_meta_field]),
+                )
+        return [parse_fn(d.meta[self.meta_field]) for d in docs_with_meta_field]
+
+    @staticmethod
+    def _date_parse(meta_value: str) -> Any:
+        return meta_value
+
+    @staticmethod
+    def _identity(meta_value: Any) -> Any:
+        return meta_value

     def _merge_rankings(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]:
         """
diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py
index 34972d88e3..6c0191169b 100644
--- a/test/components/rankers/test_metafield.py
+++ b/test/components/rankers/test_metafield.py
@@ -16,13 +16,13 @@ def test_to_dict(self):
                 "weight": 1.0,
                 "top_k": None,
                 "ranking_mode": "reciprocal_rank_fusion",
-                "infer_type": True,
+                "infer_type": False,
             },
         }

     def
diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py
index 34972d88e3..6c0191169b 100644
--- a/test/components/rankers/test_metafield.py
+++ b/test/components/rankers/test_metafield.py
@@ -16,13 +16,13 @@ def test_to_dict(self):
                 "weight": 1.0,
                 "top_k": None,
                 "ranking_mode": "reciprocal_rank_fusion",
-                "infer_type": True,
+                "infer_type": False,
             },
         }
 
     def test_to_dict_with_custom_init_parameters(self):
         component = MetaFieldRanker(
-            meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", infer_type=False
+            meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", infer_type=True
         )
         data = component.to_dict()
         assert data == {
@@ -32,7 +32,7 @@ def test_to_dict_with_custom_init_parameters(self):
                 "weight": 0.5,
                 "top_k": 5,
                 "ranking_mode": "linear_score",
-                "infer_type": False,
+                "infer_type": True,
             },
         }
 
@@ -111,6 +111,18 @@ def test_warning_if_some_meta_not_found(self, caplog):
             in caplog.text
         )
 
+    def test_warning_if_unsortable_values(self, caplog):
+        ranker = MetaFieldRanker(meta_field="rating")
+        docs_before = [
+            Document(id="1", content="abc", meta={"rating": 1.3}),
+            Document(id="2", content="abc", meta={"rating": "1.2"}),
+            Document(id="3", content="abc", meta={"rating": 2.1}),
+        ]
+        with caplog.at_level(logging.WARNING):
+            output = ranker.run(documents=docs_before)
+            assert len(output["documents"]) == 3
+            assert "Tried to sort Documents with IDs 1,2,3, but got TypeError with the message:" in caplog.text
+
     def test_raises_value_error_if_wrong_ranking_mode(self):
         with pytest.raises(ValueError):
             MetaFieldRanker(meta_field="rating", ranking_mode="wrong_mode")
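Note that at the end of this patch `_date_parse` still returns its input unchanged, even though `dateutil.parser.parse` is imported. A sketch of where it was presumably headed — hypothetical, since the next patch removes `infer_type` again:

    from typing import Any
    from dateutil.parser import parse

    def _date_parse(meta_value: str) -> Any:
        # Turn strings like "2015-02-01" into datetime objects so they sort
        # chronologically rather than lexicographically.
        try:
            return parse(meta_value)
        except (ValueError, OverflowError):
            # Not a date: fall back to the raw string.
            return meta_value
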
From 9ec440eda462c466409ff634f704aefcaa4d056e Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Mon, 15 Jan 2024 10:42:08 +0100
Subject: [PATCH 12/15] Remove infer_type for now

---
 haystack/components/rankers/meta_field.py | 43 ++---------------------
 test/components/rankers/test_metafield.py |  6 ++--
 2 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py
index 6bd53932f5..57e48995e2 100644
--- a/haystack/components/rankers/meta_field.py
+++ b/haystack/components/rankers/meta_field.py
@@ -1,8 +1,6 @@
 import logging
 from collections import defaultdict
 from typing import List, Dict, Any, Optional, Literal
-from dateutil.parser import parse
-import yaml
 
 from haystack import Document, component, default_to_dict
@@ -39,7 +37,6 @@ def __init__(
         top_k: Optional[int] = None,
         ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
         sort_order: Literal["ascending", "descending"] = "descending",
-        infer_type: bool = False,
     ):
         """
         Creates an instance of MetaFieldRanker.
@@ -56,8 +53,6 @@ def __init__(
         Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
         :param sort_order: Whether to sort the meta field by ascending or descending order.
             Possible values are `descending` (default) and `ascending`.
-        :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have
-            the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object.
         """
 
         self.meta_field = meta_field
@@ -65,7 +60,6 @@ def __init__(
         self.weight = weight
         self.top_k = top_k
         self.ranking_mode = ranking_mode
         self.sort_order = sort_order
-        self.infer_type = infer_type
         self._validate_params(
             weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode, sort_order=self.sort_order
         )
@@ -112,7 +106,7 @@ def to_dict(self) -> Dict[str, Any]:
             weight=self.weight,
             top_k=self.top_k,
             ranking_mode=self.ranking_mode,
-            infer_type=self.infer_type,
+            sort_order=self.sort_order,
         )
@@ -123,7 +117,6 @@ def to_dict(self) -> Dict[str, Any]:
     @component.output_types(documents=List[Document])
     def run(
         self,
         documents: List[Document],
         top_k: Optional[int] = None,
         weight: Optional[float] = None,
         ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
         sort_order: Optional[Literal["ascending", "descending"]] = None,
-        infer_type: Optional[bool] = None,
     ):
         """
         Use this method to rank a list of Documents based on the selected meta field by:
@@ -146,9 +139,6 @@ def run(
         :param sort_order: Whether to sort the meta field by ascending or descending order.
             Possible values are `descending` (default) and `ascending`.
             If not provided, the sort_order provided at initialization time is used.
-        :param infer_type: Whether to try and infer the data type of meta value that is a string. For example, we have
-            the field `"date": "2015-02-01"` we would infer the type of "date" to be a datetime object.
-            If not provided, the infer_type provided at initialization time is used.
         """
         if not documents:
             return {"documents": []}
@@ -157,7 +147,6 @@ def run(
         weight = weight or self.weight
         ranking_mode = ranking_mode or self.ranking_mode
         sort_order = sort_order or self.sort_order
-        infer_type = infer_type or self.infer_type
         self._validate_params(weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order)
 
         # If the weight is 0 then ranking by meta field is disabled and the original documents should be returned
@@ -186,50 +175,21 @@ def run(
         # Sort the documents by self.meta_field
-        parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, infer_type=infer_type)
         reverse = sort_order == "descending"
-        tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
         try:
-            sorted_by_meta = sorted(tuple_parsed_meta_and_docs, key=lambda x: x[0], reverse=reverse)
+            sorted_by_meta = sorted(docs_with_meta_field, key=lambda doc: doc.meta[self.meta_field], reverse=reverse)
         except TypeError as error:
             # Return the original documents if mixed types that are not comparable are present (e.g. int and list)
             logger.warning(
                 "Tried to sort Documents with IDs %s, but got TypeError with the message: %s\n"
                 "Returning the <top_k> of the original Documents since meta field ranking is not possible.",
                 ",".join([doc.id for doc in docs_with_meta_field]),
                 error,
             )
             return {"documents": documents[:top_k]}
 
         # Add the docs missing the meta_field back at the end
-        sorted_by_meta = [doc for meta, doc in sorted_by_meta]
         sorted_documents = sorted_by_meta + docs_missing_meta_field
         sorted_documents = self._merge_rankings(documents, sorted_documents)
         return {"documents": sorted_documents[:top_k]}
 
-    def _parse_meta(self, docs_with_meta_field: List[Document], infer_type: bool) -> List[Any]:
-        parse_fn = self._identity
-        if infer_type:
-            # If all values are strings, try to parse them with self._date_parse; otherwise use self._identity
-            unique_meta_values = {doc.meta[self.meta_field] for doc in docs_with_meta_field}
-            if all(isinstance(meta_value, str) for meta_value in unique_meta_values):
-                parse_fn = self._date_parse
-            else:
-                logger.warning(
-                    "The parameter <infer_type> is currently set to `True`, but not all of the meta values in the "
-                    "provided Documents with IDs %s are strings.\n"
-                    "Therefore, inferring the type of the meta values will be skipped.\n"
-                    "Set all meta values found under the <meta_field> parameter to strings to use <infer_type>.",
-                    ",".join([doc.id for doc in docs_with_meta_field]),
-                )
-        return [parse_fn(d.meta[self.meta_field]) for d in docs_with_meta_field]
-
-    @staticmethod
-    def _date_parse(meta_value: str) -> Any:
-        return meta_value
-
-    @staticmethod
-    def _identity(meta_value: Any) -> Any:
-        return meta_value
-
     def _merge_rankings(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]:
         """
         Merge the two different rankings for Documents sorted both by their content and by their meta field.
diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py
index 6c0191169b..1269d3ca83 100644
--- a/test/components/rankers/test_metafield.py
+++ b/test/components/rankers/test_metafield.py
@@ -16,13 +16,13 @@ def test_to_dict(self):
                 "weight": 1.0,
                 "top_k": None,
                 "ranking_mode": "reciprocal_rank_fusion",
-                "infer_type": False,
+                "sort_order": "descending",
             },
         }
 
     def test_to_dict_with_custom_init_parameters(self):
         component = MetaFieldRanker(
-            meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", infer_type=True
+            meta_field="rating", weight=0.5, top_k=5, ranking_mode="linear_score", sort_order="ascending"
         )
         data = component.to_dict()
         assert data == {
@@ -32,7 +32,7 @@ def test_to_dict_with_custom_init_parameters(self):
                 "weight": 0.5,
                 "top_k": 5,
                 "ranking_mode": "linear_score",
-                "infer_type": True,
+                "sort_order": "ascending",
             },
         }
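After this patch the component settles into its final shape for the PR: defaults at init time, optional overrides at run time. A minimal usage sketch based on the signatures above (the printed ordering assumes, per the docstring, that weight=1.0 ranks by the meta field only):

    from haystack import Document
    from haystack.components.rankers.meta_field import MetaFieldRanker

    ranker = MetaFieldRanker(meta_field="rating", weight=0.5)
    docs = [Document(content="abc", meta={"rating": r}) for r in [1.1, 0.5, 2.3]]

    # Init-time settings can be overridden per call for quick experiments.
    output = ranker.run(documents=docs, weight=1.0, sort_order="ascending", top_k=2)
    print([d.meta["rating"] for d in output["documents"]])  # expected: [0.5, 1.1]
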
From 15dee06aaff5deb6aba4c3854a1d596fd7d34428 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Mon, 15 Jan 2024 13:17:33 +0100
Subject: [PATCH 13/15] Release notes

---
 ...etafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml

diff --git a/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml
new file mode 100644
index 0000000000..c07f1a74aa
--- /dev/null
+++ b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Prevent the ranker from throwing an error if one or more of the documents don't contain the specified metadata field. Those documents are now ignored for ranking purposes and placed at the end of the ranked list so they are not discarded entirely.
+    Added a `sort_order` init parameter that can be set to `descending` (default) or `ascending`.
+    Added `weight`, `ranking_mode`, and `sort_order` as optional `run` parameters for easier experimentation.

From 0488e595e7ed928898600e5c69c3d8dd836e988b Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Mon, 15 Jan 2024 15:04:59 +0100
Subject: [PATCH 14/15] Add init file

---
 test/components/rankers/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 test/components/rankers/__init__.py

diff --git a/test/components/rankers/__init__.py b/test/components/rankers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2

From 7b4029739406691a24dad19e5137ed442cf869d6 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Mon, 15 Jan 2024 22:21:24 +0100
Subject: [PATCH 15/15] Update
 releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml

Co-authored-by: Stefano Fiorucci
---
 .../metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml
index c07f1a74aa..18a792c232 100644
--- a/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml
+++ b/releasenotes/notes/metafieldranker_sort-order_refactor-2000d89dc40dc15a.yaml
@@ -1,6 +1,6 @@
 ---
 enhancements:
   - |
-    Prevent the ranker from throwing an error if one or more of the documents don't contain the specified metadata field. Those documents are now ignored for ranking purposes and placed at the end of the ranked list so they are not discarded entirely.
+    Prevent the `MetaFieldRanker` from throwing an error if one or more of the documents don't contain the specified metadata field. Those documents are now ignored for ranking purposes and placed at the end of the ranked list so they are not discarded entirely.
     Added a `sort_order` init parameter that can be set to `descending` (default) or `ascending`.
     Added `weight`, `ranking_mode`, and `sort_order` as optional `run` parameters for easier experimentation.
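
To make the release note concrete, a small end-to-end sketch of the documented behavior (the expected ordering is inferred from the note's description, not taken from a test in this series):

    from haystack import Document
    from haystack.components.rankers.meta_field import MetaFieldRanker

    ranker = MetaFieldRanker(meta_field="rating", weight=1.0)
    docs = [
        Document(content="a", meta={"rating": 0.5}),
        Document(content="b"),  # no "rating" meta: kept, but ranked last instead of raising
        Document(content="c", meta={"rating": 2.3}),
    ]
    result = ranker.run(documents=docs)
    print([d.content for d in result["documents"]])  # expected: ["c", "a", "b"]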