From a21b0c223323ab2513521d494ecd5faba2826368 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 21 May 2024 16:29:17 +0200 Subject: [PATCH 01/23] wip --- .../components/evaluators/llm_evaluator.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e4eebbd9ab..0c21c8dfeb 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -50,6 +50,7 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], + raises_on_failure: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -70,6 +71,8 @@ def __init__( `outputs` parameters. Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. + :param raises_on_failure: + If True, the component will raise an exception if the evaluation fails. :param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". @@ -78,7 +81,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) - + self.raise_on_failure = raises_on_failure self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -293,8 +296,7 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - @staticmethod - def validate_outputs(expected: List[str], received: str) -> None: + def validate_outputs(self, expected: List[str], received: str) -> None: """ Validate the output. @@ -306,7 +308,14 @@ def validate_outputs(expected: List[str], received: str) -> None: :raises ValueError: If not all expected outputs are present in the received outputs """ - parsed_output = json.loads(received) - if not all(output in parsed_output for output in expected): - msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - raise ValueError(msg) + try: + parsed_output = json.loads(received) + + if not all(output in parsed_output for output in expected): + msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." + raise ValueError(msg) + + except json.JSONDecodeError: + if self.raise_on_failure: + raise ValueError("Response from LLM evaluator is not a valid JSON.") + # ToDo: issue a warning or/and log the error From 91ad2ef96bf1ddc946d04649d8cdf58e6eb9677b Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 16:48:20 +0200 Subject: [PATCH 02/23] initial import --- haystack/components/evaluators/llm_evaluator.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0c21c8dfeb..6d1270f040 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type + +from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder @@ -51,6 +53,7 @@ def __init__( outputs: List[str], examples: List[Dict[str, Any]], raises_on_failure: bool = True, + progress_bar: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -82,6 +85,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) self.raise_on_failure = raises_on_failure + self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -176,10 +180,12 @@ def run(self, **inputs) -> Dict[str, Any]: list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] results = [] - for input_names_to_values in list_of_input_names_to_values: + for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) result = self.generator.run(prompt=prompt["prompt"]) + # ToDo: how to handle too large context + self.validate_outputs(expected=self.outputs, received=result["replies"][0]) parsed_result = json.loads(result["replies"][0]) results.append(parsed_result) @@ -296,7 +302,7 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> None: + def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: """ Validate the output. @@ -319,3 +325,5 @@ def validate_outputs(self, expected: List[str], received: str) -> None: if self.raise_on_failure: raise ValueError("Response from LLM evaluator is not a valid JSON.") # ToDo: issue a warning or/and log the error + Warning("Response from LLM evaluator is not a valid JSON.") + return "{}" From 8746035910caaaa33ff0ad1c14de10844e9b136c Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 18:05:20 +0200 Subject: [PATCH 03/23] adding tests --- .../components/evaluators/llm_evaluator.py | 25 +++++++++++++---- .../evaluators/test_llm_evaluator.py | 28 +++++++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 6d1270f040..0bd274ead4 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,8 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import json +import logging from typing import Any, Dict, List, Optional, Tuple, Type +from warnings import warn +import numpy as np from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict @@ -12,6 +15,8 @@ from haystack.components.generators import OpenAIGenerator from haystack.utils import Secret, deserialize_secrets_inplace +logger = logging.getLogger(__name__) + @component class LLMEvaluator: @@ -302,10 +307,15 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: + def validate_outputs(self, expected: List[str], received: str) -> Optional[float]: """ Validate the output. + If `raise_on_failure` is True, raise a ValueError if not all expected outputs are present in the received + outputs or if the received outputs are not a valid JSON. + + If `raise_on_failure` is False, print a warning if the received outputs are not a valid JSON and return a `nan`. + :param expected: Names of expected outputs :param received: @@ -319,11 +329,14 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: if not all(output in parsed_output for output in expected): msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - raise ValueError(msg) + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return np.nan except json.JSONDecodeError: + msg = "Response from LLM evaluator is not a valid JSON." 
if self.raise_on_failure: - raise ValueError("Response from LLM evaluator is not a valid JSON.") - # ToDo: issue a warning or/and log the error - Warning("Response from LLM evaluator is not a valid JSON.") - return "{}" + raise ValueError(msg) + warn(msg) + return np.nan diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index b1d41e000c..ef72f5ac7f 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import List +import numpy as np import pytest from haystack.components.evaluators import LLMEvaluator @@ -382,6 +383,33 @@ def test_invalid_outputs(self, monkeypatch): with pytest.raises(ValueError): component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}') + def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("predicted_answers", List[str])], + outputs=["score"], + examples=[ + {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} + ], + raises_on_failure=False, + ) + result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") + assert np.isnan(result) + + def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("predicted_answers", List[str])], + outputs=["score"], + examples=[ + {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} + ], + ) + with pytest.raises(ValueError): + component.validate_outputs(expected=["score"], received="some_invalid_json_output") + def test_unsupported_api(self): with pytest.raises(ValueError): LLMEvaluator( From 3d16830a09100dd9e3c496c7bdcadb18532a7d26 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 21 May 2024 19:37:01 +0200 Subject: [PATCH 04/23] adding params --- haystack/components/evaluators/context_relevance.py | 6 ++++++ haystack/components/evaluators/faithfulness.py | 6 ++++++ haystack/components/evaluators/llm_evaluator.py | 4 ++-- test/components/evaluators/test_llm_evaluator.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 9988bdeb02..61f085366a 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -69,6 +69,8 @@ def __init__( examples: Optional[List[Dict[str, Any]]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + raise_on_failure: bool = True, + progress_bar: bool = True, ): """ Creates an instance of ContextRelevanceEvaluator. 
@@ -107,6 +109,8 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key + self.raise_on_failure = raise_on_failure + self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -115,6 +119,8 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, + raise_on_failure=self.raise_on_failure, + progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 2bcbb9b086..e9e38a30e7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -83,6 +83,8 @@ def __init__( examples: Optional[List[Dict[str, Any]]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + raise_on_failure: bool = True, + progress_bar: bool = True, ): """ Creates an instance of FaithfulnessEvaluator. @@ -123,6 +125,8 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key + self.raise_on_failure = raise_on_failure + self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -131,6 +135,8 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, + raise_on_failure=self.raise_on_failure, + progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0bd274ead4..3427012ed1 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -57,7 +57,7 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], - raises_on_failure: bool = True, + raise_on_failure: bool = True, progress_bar: bool = True, *, api: str = "openai", @@ -89,7 +89,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) - self.raise_on_failure = raises_on_failure + self.raise_on_failure = raise_on_failure self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index ef72f5ac7f..7bd459c89c 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -392,7 +392,7 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): examples=[ {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} ], - raises_on_failure=False, + raise_on_failure=False, ) result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") assert np.isnan(result) From 33dd22dbb2c902884c3045a342a990342c80e3ae Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 23:35:02 +0200 Subject: [PATCH 05/23] adding safeguards for nan in evaluators --- .../evaluators/context_relevance.py | 6 ++- .../components/evaluators/faithfulness.py | 6 ++- .../components/evaluators/llm_evaluator.py | 45 ++++++++++++------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 61f085366a..6f75b88a3e 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional +from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -141,7 +142,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] result = super().run(questions=questions, contexts=contexts) # calculate average statement relevance score per query - for res in result["results"]: + for idx, res in enumerate(result["results"]): + if isinstance(res, float) and isnan(res): + result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + continue if not res["statements"]: res["score"] = 0 else: diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index e9e38a30e7..d9eb3efe38 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional +from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -159,7 +160,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) # calculate average statement faithfulness score per query - for res in result["results"]: + for idx, res in enumerate(result["results"]): + if isinstance(res, float) and isnan(res): + result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + continue if not res["statements"]: res["score"] = 0 else: diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 3427012ed1..0d176e8a61 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -79,7 +79,7 @@ def __init__( `outputs` parameters. Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. - :param raises_on_failure: + :param raise_on_failure: If True, the component will raise an exception if the evaluation fails. :param api: The API to use for calling an LLM through a Generator. @@ -170,6 +170,8 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. + # ToDo: add more details about the behavior of this method and it's exceptions + :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. 
:returns: @@ -187,13 +189,21 @@ def run(self, **inputs) -> Dict[str, Any]: results = [] for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) - result = self.generator.run(prompt=prompt["prompt"]) - - # ToDo: how to handle too large context + try: + result = self.generator.run(prompt=prompt["prompt"]) + except Exception as e: + msg = f"Error while generating response for prompt: {prompt}. Error: {e}" + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + results.append(np.nan) + continue - self.validate_outputs(expected=self.outputs, received=result["replies"][0]) - parsed_result = json.loads(result["replies"][0]) - results.append(parsed_result) + if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): + parsed_result = json.loads(result["replies"][0]) + results.append(parsed_result) + else: + results.append(np.nan) return {"results": results} @@ -307,14 +317,14 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> Optional[float]: + def is_valid_json(self, expected: List[str], received: str) -> bool: """ - Validate the output. + Output must be a valid JSON with the expected keys. - If `raise_on_failure` is True, raise a ValueError if not all expected outputs are present in the received - outputs or if the received outputs are not a valid JSON. - - If `raise_on_failure` is False, print a warning if the received outputs are not a valid JSON and return a `nan`. + If the output is not a valid JSON with the expected keys: + - with `raise_on_failure` set to True a ValueError is raised. + - with `raise_on_failure` set to False a warning is issued and False is returned. + If the output is a valid JSON with the expected keys, True is returned. :param expected: Names of expected outputs @@ -323,6 +333,9 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float :raises ValueError: If not all expected outputs are present in the received outputs + + :returns: + True if the received output is a valid JSON with the expected keys, False otherwise. """ try: parsed_output = json.loads(received) @@ -332,11 +345,13 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float if self.raise_on_failure: raise ValueError(msg) warn(msg) - return np.nan + return False except json.JSONDecodeError: msg = "Response from LLM evaluator is not a valid JSON." if self.raise_on_failure: raise ValueError(msg) warn(msg) - return np.nan + return False + + return True From 7473d1fd64a31fe31cdab54871af170a3e9fddf9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 09:35:38 +0200 Subject: [PATCH 06/23] adding docstrings --- haystack/components/evaluators/llm_evaluator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0d176e8a61..39633d279b 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -170,7 +170,11 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. - # ToDo: add more details about the behavior of this method and it's exceptions + Running the LLM evaluator is done within a try-except block to catch any exceptions that may + occur during the run. 
If an exception occurs, the method will return a np.nan value for the result. + + Likewise, if the output is not a valid JSON with the expected keys, the method will return a np.nan value + for the result. :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. From b2ff89a7512a6d2a4855f301831249a722c1513b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 09:56:26 +0200 Subject: [PATCH 07/23] fixing tests --- test/components/evaluators/test_llm_evaluator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 7bd459c89c..f1930e3074 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -378,10 +378,10 @@ def test_invalid_outputs(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}') + component.is_valid_json(expected=["score", "another_expected_output"], received='{"score": 1.0}') with pytest.raises(ValueError): - component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}') + component.is_valid_json(expected=["score"], received='{"wrong_name": 1.0}') def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -394,8 +394,7 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): ], raise_on_failure=False, ) - result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") - assert np.isnan(result) + assert component.is_valid_json(expected=["score"], received="some_invalid_json_output") is False def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -408,7 +407,7 @@ def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.validate_outputs(expected=["score"], received="some_invalid_json_output") + component.is_valid_json(expected=["score"], received="some_invalid_json_output") def test_unsupported_api(self): with pytest.raises(ValueError): From 860c2aac85d47e5cfb7663b61a16bec3559595cb Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 10:04:06 +0200 Subject: [PATCH 08/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 39633d279b..e1651b52df 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -4,7 +4,7 @@ import json import logging -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Tuple, Type from warnings import warn import numpy as np From d502ed969ba0bc22375463dbecfe9c8bf4283620 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 10:38:20 +0200 Subject: [PATCH 09/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e1651b52df..177834b8c3 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -15,8 +15,6 @@ from haystack.components.generators import OpenAIGenerator from haystack.utils import Secret, deserialize_secrets_inplace -logger = logging.getLogger(__name__) - @component class LLMEvaluator: @@ -191,6 +189,7 @@ def run(self, **inputs) -> Dict[str, Any]: list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] results = [] + errors = 0 for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) try: @@ -201,6 +200,7 @@ def run(self, **inputs) -> Dict[str, Any]: raise ValueError(msg) warn(msg) results.append(np.nan) + errors += 1 continue if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): @@ -208,6 +208,11 @@ def run(self, **inputs) -> Dict[str, Any]: results.append(parsed_result) else: results.append(np.nan) + errors += 1 + + if errors > 0: + msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs." + warn(msg) return {"results": results} From 2538ed3ec903e11047f681877ec3fbd162b06a36 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 10:47:58 +0200 Subject: [PATCH 10/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 177834b8c3..04671196f8 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import json -import logging from typing import Any, Dict, List, Tuple, Type from warnings import warn From f5f3818e788aac0b861521b54ba4c8b4538b6f31 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 11:47:31 +0200 Subject: [PATCH 11/23] adding tests to context and faithfullness evaluators --- .../test_context_relevance_evaluator.py | 36 ++++++++++++++++ .../evaluators/test_faithfulness_evaluator.py | 41 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index ecbc215d06..7e28edd18d 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -159,6 +159,42 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 2 required positional arguments"): component.run() + def test_run_handles_nan(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = ContextRelevanceEvaluator(progress_bar=False, raise_on_failure=False) + + def generator_run(self, *args, **kwargs): + if "Python" in kwargs["prompt"]: + raise Exception("OpenAI API request failed.") + else: + return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + questions = ["Which is the most popular global sport?", "Who created the Python language?"] + contexts = [ + [ + "The popularity of sports can be measured in various ways, including TV viewership, social media " + "presence, number of participants, and economic impact. Football is undoubtedly the world's most " + "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and " + "Messi, drawing a followership of more than 4 billion people." + ], + [ + "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming " + "language. Its design philosophy emphasizes code readability, and its language constructs aim to help " + "programmers write clear, logical code for both small and large-scale software projects." 
+ ], + ] + results = component.run(questions=questions, contexts=contexts) + assert results == { + "individual_scores": [1, 0], + "results": [ + {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, + {"score": 0, "statement_scores": [], "statements": []}, + ], + "score": 0.5, + } + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index e493b709ef..38434c1dab 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -4,6 +4,7 @@ import os from typing import List +import numpy as np import pytest from haystack.components.evaluators import FaithfulnessEvaluator @@ -191,6 +192,46 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 3 required positional arguments"): component.run() + def test_run_handles_nan(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = FaithfulnessEvaluator(progress_bar=False, raise_on_failure=False) + + def generator_run(self, *args, **kwargs): + if "Python" in kwargs["prompt"]: + raise Exception("OpenAI API request failed.") + else: + return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + questions = ["Which is the most popular global sport?", "Who created the Python language?"] + contexts = [ + [ + "The popularity of sports can be measured in various ways, including TV viewership, social media " + "presence, number of participants, and economic impact. Football is undoubtedly the world's most " + "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and " + "Messi, drawing a followership of more than 4 billion people." + ], + [ + "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming " + "language. Its design philosophy emphasizes code readability, and its language constructs aim to help " + "programmers write clear, logical code for both small and large-scale software projects." + ], + ] + predicted_answers = [ + "Football is the most popular sport with around 4 billion followers worldwide.", + "Guido van Rossum.", + ] + results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + assert results == { + "individual_scores": [1, 0], + "results": [ + {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, + {"score": 0, "statement_scores": [], "statements": []}, + ], + "score": 0.5, + } + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", From a271db7a399bd053a12644110b46e036679d61b8 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 12:24:41 +0200 Subject: [PATCH 12/23] fixing docstrings --- haystack/components/evaluators/llm_evaluator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 04671196f8..0711fcb89d 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -168,10 +168,10 @@ def run(self, **inputs) -> Dict[str, Any]: Run the LLM evaluator. Running the LLM evaluator is done within a try-except block to catch any exceptions that may - occur during the run. If an exception occurs, the method will return a np.nan value for the result. + occur during the run. If an exception occurs, the method will return a `np.nan` value for the result. - Likewise, if the output is not a valid JSON with the expected keys, the method will return a np.nan value - for the result. + Likewise, if the output is not a valid JSON or does not have the expected keys, the method will return a + `np.nan` value for the result. :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. From 54a0146f120f46a782f06bfef70c4652395b5864 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 12:30:33 +0200 Subject: [PATCH 13/23] nit --- haystack/components/evaluators/context_relevance.py | 3 --- haystack/components/evaluators/faithfulness.py | 3 --- haystack/components/evaluators/llm_evaluator.py | 4 +--- .../components/evaluators/test_context_relevance_evaluator.py | 2 +- test/components/evaluators/test_faithfulness_evaluator.py | 2 +- 5 files changed, 3 insertions(+), 11 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 6f75b88a3e..d44b3f3b33 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -71,7 +71,6 @@ def __init__( api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), raise_on_failure: bool = True, - progress_bar: bool = True, ): """ Creates an instance of ContextRelevanceEvaluator. @@ -111,7 +110,6 @@ def __init__( self.api = api self.api_key = api_key self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -121,7 +119,6 @@ def __init__( api=self.api, api_key=self.api_key, raise_on_failure=self.raise_on_failure, - progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index d9eb3efe38..7307e867c7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -85,7 +85,6 @@ def __init__( api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), raise_on_failure: bool = True, - progress_bar: bool = True, ): """ Creates an instance of FaithfulnessEvaluator. 
@@ -127,7 +126,6 @@ def __init__( self.api = api self.api_key = api_key self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -137,7 +135,6 @@ def __init__( api=self.api, api_key=self.api_key, raise_on_failure=self.raise_on_failure, - progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0711fcb89d..e48043d091 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -55,7 +55,6 @@ def __init__( outputs: List[str], examples: List[Dict[str, Any]], raise_on_failure: bool = True, - progress_bar: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -87,7 +86,6 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -189,7 +187,7 @@ def run(self, **inputs) -> Dict[str, Any]: results = [] errors = 0 - for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): + for input_names_to_values in list_of_input_names_to_values: prompt = self.builder.run(**input_names_to_values) try: result = self.generator.run(prompt=prompt["prompt"]) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 7e28edd18d..1f21f8537c 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -161,7 +161,7 @@ def test_run_missing_parameters(self, monkeypatch): def test_run_handles_nan(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = ContextRelevanceEvaluator(progress_bar=False, raise_on_failure=False) + component = ContextRelevanceEvaluator(raise_on_failure=False) def generator_run(self, *args, **kwargs): if "Python" in kwargs["prompt"]: diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 38434c1dab..6e3ea8749f 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -194,7 +194,7 @@ def test_run_missing_parameters(self, monkeypatch): def test_run_handles_nan(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = FaithfulnessEvaluator(progress_bar=False, raise_on_failure=False) + component = FaithfulnessEvaluator(raise_on_failure=False) def generator_run(self, *args, **kwargs): if "Python" in kwargs["prompt"]: From 12164d8971e10c346c8076bcb840139ba059ffc0 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 12:40:28 +0200 Subject: [PATCH 14/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e48043d091..b0a0a6b06e 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -7,7 +7,6 @@ from warnings import warn import numpy as np -from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder From 687312f75a36df0176e53181693d2320322e5dd8 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 12:44:50 +0200 Subject: [PATCH 15/23] adding release notes --- ...d-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml diff --git a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml new file mode 100644 index 0000000000..ae21560df4 --- /dev/null +++ b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, it returns `np.nan` and continues the evaluation. + The user is notified with a warning indicating the number of requests that failed. From 2b94818d2b4c0cbecd17ac6a8b88b11702150302 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 16:21:36 +0200 Subject: [PATCH 16/23] attending PR comments --- .../evaluators/context_relevance.py | 12 ++++---- .../components/evaluators/faithfulness.py | 11 +++---- .../components/evaluators/llm_evaluator.py | 30 +++++++------------ ...LLM-based-evaluators-34cdc183ab545315.yaml | 2 +- 4 files changed, 25 insertions(+), 30 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index d44b3f3b33..83a8f5af09 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 +from math import isnan from typing import Any, Dict, List, Optional -from numpy import isnan +import numpy as np from numpy import mean as np_mean from haystack import default_from_dict @@ -96,6 +97,8 @@ def __init__( Supported APIs: "openai". :param api_key: The API key. + :param raise_on_failure: + Whether to raise an exception if the API call fails. 
""" self.instructions = ( @@ -109,7 +112,6 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key - self.raise_on_failure = raise_on_failure super().__init__( instructions=self.instructions, @@ -118,7 +120,7 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, - raise_on_failure=self.raise_on_failure, + raise_on_failure=raise_on_failure, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) @@ -140,8 +142,8 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] # calculate average statement relevance score per query for idx, res in enumerate(result["results"]): - if isinstance(res, float) and isnan(res): - result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + if not res: + result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: res["score"] = 0 diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 7307e867c7..3d2dac3458 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 +from math import isnan from typing import Any, Dict, List, Optional -from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -111,6 +111,8 @@ def __init__( Supported APIs: "openai". :param api_key: The API key. + :param raise_on_failure: + Whether to raise an exception if the API call fails. """ self.instructions = ( @@ -125,7 +127,6 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key - self.raise_on_failure = raise_on_failure super().__init__( instructions=self.instructions, @@ -134,7 +135,7 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, - raise_on_failure=self.raise_on_failure, + raise_on_failure=raise_on_failure, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) @@ -158,8 +159,8 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers # calculate average statement faithfulness score per query for idx, res in enumerate(result["results"]): - if isinstance(res, float) and isnan(res): - result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + if not res: + result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: res["score"] = 0 diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index b0a0a6b06e..79de577a38 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -53,8 +53,8 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], - raise_on_failure: bool = True, *, + raise_on_failure: bool = True, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), ): @@ -75,7 +75,7 @@ def __init__( Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. :param raise_on_failure: - If True, the component will raise an exception if the evaluation fails. + If True, the component will raise an exception on an unsuccessful API call. 
:param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". @@ -164,18 +164,13 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. - Running the LLM evaluator is done within a try-except block to catch any exceptions that may - occur during the run. If an exception occurs, the method will return a `np.nan` value for the result. - - Likewise, if the output is not a valid JSON or does not have the expected keys, the method will return a - `np.nan` value for the result. - :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. :returns: A dictionary with a single `results` entry that contains a list of results. Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator - and the evaluation results as the values. + and the evaluation results as the values. If an exception occurs for a particular input value, the result + will be `None` for that entry. """ self.validate_input_parameters(dict(self.inputs), inputs) @@ -195,15 +190,15 @@ def run(self, **inputs) -> Dict[str, Any]: if self.raise_on_failure: raise ValueError(msg) warn(msg) - results.append(np.nan) + results.append(None) errors += 1 continue - if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): + if self.is_valid_json_and_has_expected_keys(expected=self.outputs, received=result["replies"][0]): parsed_result = json.loads(result["replies"][0]) results.append(parsed_result) else: - results.append(np.nan) + results.append(None) errors += 1 if errors > 0: @@ -322,22 +317,19 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def is_valid_json(self, expected: List[str], received: str) -> bool: + def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str) -> bool: """ Output must be a valid JSON with the expected keys. - If the output is not a valid JSON with the expected keys: - - with `raise_on_failure` set to True a ValueError is raised. - - with `raise_on_failure` set to False a warning is issued and False is returned. - If the output is a valid JSON with the expected keys, True is returned. - :param expected: Names of expected outputs :param received: Names of received outputs :raises ValueError: - If not all expected outputs are present in the received outputs + If the output is not a valid JSON with the expected keys: + - with `raise_on_failure` set to True a ValueError is raised. + - with `raise_on_failure` set to False a warning is issued and False is returned. :returns: True if the received output is a valid JSON with the expected keys, False otherwise. diff --git a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml index ae21560df4..a97d33c8a2 100644 --- a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml +++ b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml @@ -1,5 +1,5 @@ --- enhancements: - | - If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, it returns `np.nan` and continues the evaluation. 
+ If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, the score of the sample is set to `NaN` instead of raising an exception. The user is notified with a warning indicating the number of requests that failed. From a2c69dd59906af6d06c45f4454afc9d8f2061cd0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 17:10:14 +0200 Subject: [PATCH 17/23] fixing tests --- haystack/components/evaluators/llm_evaluator.py | 15 +++++++-------- .../evaluators/test_faithfulness_evaluator.py | 9 +++++---- test/components/evaluators/test_llm_evaluator.py | 13 +++++++++---- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 79de577a38..6aec6419dd 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -336,14 +336,6 @@ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str """ try: parsed_output = json.loads(received) - - if not all(output in parsed_output for output in expected): - msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - if self.raise_on_failure: - raise ValueError(msg) - warn(msg) - return False - except json.JSONDecodeError: msg = "Response from LLM evaluator is not a valid JSON." if self.raise_on_failure: @@ -351,4 +343,11 @@ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str warn(msg) return False + if not all(output in parsed_output for output in expected): + msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." 
+ if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return False + return True diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 6e3ea8749f..a393d29afc 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -223,13 +223,14 @@ def generator_run(self, *args, **kwargs): "Guido van Rossum.", ] results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + assert results == { - "individual_scores": [1, 0], + "individual_scores": [1.0, float(nan)], "results": [ - {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, - {"score": 0, "statement_scores": [], "statements": []}, + {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}, + {"statements": [], "statement_scores": [], "score": float(nan)}, ], - "score": 0.5, + "score": float(nan), } @pytest.mark.skipif( diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index f1930e3074..f96170d66f 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -378,10 +378,12 @@ def test_invalid_outputs(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score", "another_expected_output"], received='{"score": 1.0}') + component.is_valid_json_and_has_expected_keys( + expected=["score", "another_expected_output"], received='{"score": 1.0}' + ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score"], received='{"wrong_name": 1.0}') + component.is_valid_json_and_has_expected_keys(expected=["score"], received='{"wrong_name": 1.0}') def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -394,7 +396,10 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): ], raise_on_failure=False, ) - assert component.is_valid_json(expected=["score"], received="some_invalid_json_output") is False + assert ( + component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output") + is False + ) def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -407,7 +412,7 @@ def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score"], received="some_invalid_json_output") + component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output") def test_unsupported_api(self): with pytest.raises(ValueError): From e9497ec542e14147f560fc051386fc8433c23f17 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 17:36:45 +0200 Subject: [PATCH 18/23] fixing tests --- .../components/evaluators/llm_evaluator.py | 2 -- .../test_context_relevance_evaluator.py | 21 ++++++++++++------- .../evaluators/test_faithfulness_evaluator.py | 19 ++++++++++------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 6aec6419dd..7a63e39790 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -6,8 +6,6 @@ from typing import Any, Dict, List, Tuple, Type from warnings import warn -import numpy as np - from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder from haystack.components.generators import OpenAIGenerator diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 1f21f8537c..de5ec907c0 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -4,6 +4,8 @@ import os from typing import List +import math + import pytest from haystack.components.evaluators import ContextRelevanceEvaluator @@ -186,14 +188,17 @@ def generator_run(self, *args, **kwargs): ], ] results = component.run(questions=questions, contexts=contexts) - assert results == { - "individual_scores": [1, 0], - "results": [ - {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, - {"score": 0, "statement_scores": [], "statements": []}, - ], - "score": 0.5, - } + + assert math.isnan(results["score"]) + + assert results["individual_scores"][0] == 1.0 + assert math.isnan(results["individual_scores"][1]) + + assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0} + + assert results["results"][1]["statements"] == [] + assert results["results"][1]["statement_scores"] == [] + assert math.isnan(results["results"][1]["score"]) @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index a393d29afc..abfe74c455 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import os +import math from typing import List import numpy as np @@ -224,14 +225,16 @@ def generator_run(self, *args, **kwargs): ] results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) - assert results == { - "individual_scores": [1.0, float(nan)], - "results": [ - {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}, - {"statements": [], "statement_scores": [], "score": float(nan)}, - ], - "score": float(nan), - } + assert math.isnan(results["score"]) + + assert results["individual_scores"][0] == 1.0 + assert math.isnan(results["individual_scores"][1]) + + assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0} + + assert results["results"][1]["statements"] == [] + assert results["results"][1]["statement_scores"] == [] + assert math.isnan(results["results"][1]["score"]) @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), From c0570ecb1843017aea9b425edfcaede52618f9da Mon Sep 17 00:00:00 2001 From: "David 
S. Batista" Date: Wed, 22 May 2024 17:52:51 +0200 Subject: [PATCH 19/23] adding types --- haystack/components/evaluators/llm_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 7a63e39790..fd9bcde39e 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type from warnings import warn from haystack import component, default_from_dict, default_to_dict @@ -177,7 +177,7 @@ def run(self, **inputs) -> Dict[str, Any]: input_names, values = inputs.keys(), list(zip(*inputs.values())) list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] - results = [] + results: List[Optional[Dict[str, Any]]] = [] errors = 0 for input_names_to_values in list_of_input_names_to_values: prompt = self.builder.run(**input_names_to_values) From 796588c5514263f9eeb819756ba4d0701128665b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 18:02:49 +0200 Subject: [PATCH 20/23] removing unused imports --- haystack/components/evaluators/context_relevance.py | 2 -- haystack/components/evaluators/faithfulness.py | 1 - 2 files changed, 3 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 83a8f5af09..29ce5edea1 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,10 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from math import isnan from typing import Any, Dict, List, Optional -import numpy as np from numpy import mean as np_mean from haystack import default_from_dict diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 3d2dac3458..7f9dec88e7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from math import isnan from typing import Any, Dict, List, Optional from numpy import mean as np_mean From 50f64773edaf850eab2d1c5c655a705c8e49bbd4 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 23 May 2024 09:21:26 +0200 Subject: [PATCH 21/23] Update haystack/components/evaluators/context_relevance.py Co-authored-by: Madeesh Kannan --- haystack/components/evaluators/context_relevance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 29ce5edea1..8a4ef124f8 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -140,7 +140,7 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] # calculate average statement relevance score per query for idx, res in enumerate(result["results"]): - if not res: + if res is None: result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: From 8ce0c9dacef7a93c6449cf70e98da6d3cc59b7cd Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 23 May 2024 09:21:32 +0200 Subject: [PATCH 22/23] Update haystack/components/evaluators/faithfulness.py Co-authored-by: Madeesh Kannan --- haystack/components/evaluators/faithfulness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 7f9dec88e7..8d46e9bf9e 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -158,7 +158,7 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers # calculate average statement faithfulness score per query for idx, res in enumerate(result["results"]): - if not res: + if res is None: result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: From 391e4fac564caad6ded4459a6fa8dc182c783a5f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 23 May 2024 17:13:13 +0200 Subject: [PATCH 23/23] attending PR comments --- haystack/components/evaluators/llm_evaluator.py | 3 +++ test/components/evaluators/test_context_relevance_evaluator.py | 2 +- test/components/evaluators/test_faithfulness_evaluator.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index fe6f1e8b4d..fdfe49ffd1 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -175,6 +175,9 @@ def run(self, **inputs) -> Dict[str, Any]: Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator and the evaluation results as the values. If an exception occurs for a particular input value, the result will be `None` for that entry. + :raises ValueError: + Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have + different lengths, or if the output is not a valid JSON or doesn't contain the expected keys. """ self.validate_input_parameters(dict(self.inputs), inputs) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index de5ec907c0..2db69004d1 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -161,7 +161,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 2 required positional arguments"): component.run() - def test_run_handles_nan(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = ContextRelevanceEvaluator(raise_on_failure=False) diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index abfe74c455..5c32f8c067 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -193,7 +193,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 3 required positional arguments"): component.run() - def test_run_handles_nan(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = FaithfulnessEvaluator(raise_on_failure=False)