From a21b0c223323ab2513521d494ecd5faba2826368 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 21 May 2024 16:29:17 +0200 Subject: [PATCH 01/23] wip --- .../components/evaluators/llm_evaluator.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e4eebbd9ab..0c21c8dfeb 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -50,6 +50,7 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], + raises_on_failure: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -70,6 +71,8 @@ def __init__( `outputs` parameters. Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. + :param raises_on_failure: + If True, the component will raise an exception if the evaluation fails. :param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". @@ -78,7 +81,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) - + self.raise_on_failure = raises_on_failure self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -293,8 +296,7 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - @staticmethod - def validate_outputs(expected: List[str], received: str) -> None: + def validate_outputs(self, expected: List[str], received: str) -> None: """ Validate the output. @@ -306,7 +308,14 @@ def validate_outputs(expected: List[str], received: str) -> None: :raises ValueError: If not all expected outputs are present in the received outputs """ - parsed_output = json.loads(received) - if not all(output in parsed_output for output in expected): - msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - raise ValueError(msg) + try: + parsed_output = json.loads(received) + + if not all(output in parsed_output for output in expected): + msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." + raise ValueError(msg) + + except json.JSONDecodeError: + if self.raise_on_failure: + raise ValueError("Response from LLM evaluator is not a valid JSON.") + # ToDo: issue a warning or/and log the error From 91ad2ef96bf1ddc946d04649d8cdf58e6eb9677b Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 16:48:20 +0200 Subject: [PATCH 02/23] initial import --- haystack/components/evaluators/llm_evaluator.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0c21c8dfeb..6d1270f040 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type + +from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder @@ -51,6 +53,7 @@ def __init__( outputs: List[str], examples: List[Dict[str, Any]], raises_on_failure: bool = True, + progress_bar: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -82,6 +85,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) self.raise_on_failure = raises_on_failure + self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -176,10 +180,12 @@ def run(self, **inputs) -> Dict[str, Any]: list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] results = [] - for input_names_to_values in list_of_input_names_to_values: + for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) result = self.generator.run(prompt=prompt["prompt"]) + # ToDo: how to handle too large context + self.validate_outputs(expected=self.outputs, received=result["replies"][0]) parsed_result = json.loads(result["replies"][0]) results.append(parsed_result) @@ -296,7 +302,7 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> None: + def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: """ Validate the output. @@ -319,3 +325,5 @@ def validate_outputs(self, expected: List[str], received: str) -> None: if self.raise_on_failure: raise ValueError("Response from LLM evaluator is not a valid JSON.") # ToDo: issue a warning or/and log the error + Warning("Response from LLM evaluator is not a valid JSON.") + return "{}" From 8746035910caaaa33ff0ad1c14de10844e9b136c Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 18:05:20 +0200 Subject: [PATCH 03/23] adding tests --- .../components/evaluators/llm_evaluator.py | 25 +++++++++++++---- .../evaluators/test_llm_evaluator.py | 28 +++++++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 6d1270f040..0bd274ead4 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,8 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import json +import logging from typing import Any, Dict, List, Optional, Tuple, Type +from warnings import warn +import numpy as np from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict @@ -12,6 +15,8 @@ from haystack.components.generators import OpenAIGenerator from haystack.utils import Secret, deserialize_secrets_inplace +logger = logging.getLogger(__name__) + @component class LLMEvaluator: @@ -302,10 +307,15 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: + def validate_outputs(self, expected: List[str], received: str) -> Optional[float]: """ Validate the output. + If `raise_on_failure` is True, raise a ValueError if not all expected outputs are present in the received + outputs or if the received outputs are not a valid JSON. + + If `raise_on_failure` is False, print a warning if the received outputs are not a valid JSON and return a `nan`. + :param expected: Names of expected outputs :param received: @@ -319,11 +329,14 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[str]: if not all(output in parsed_output for output in expected): msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - raise ValueError(msg) + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return np.nan except json.JSONDecodeError: + msg = "Response from LLM evaluator is not a valid JSON." 
if self.raise_on_failure: - raise ValueError("Response from LLM evaluator is not a valid JSON.") - # ToDo: issue a warning or/and log the error - Warning("Response from LLM evaluator is not a valid JSON.") - return "{}" + raise ValueError(msg) + warn(msg) + return np.nan diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index b1d41e000c..ef72f5ac7f 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import List +import numpy as np import pytest from haystack.components.evaluators import LLMEvaluator @@ -382,6 +383,33 @@ def test_invalid_outputs(self, monkeypatch): with pytest.raises(ValueError): component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}') + def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("predicted_answers", List[str])], + outputs=["score"], + examples=[ + {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} + ], + raises_on_failure=False, + ) + result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") + assert np.isnan(result) + + def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("predicted_answers", List[str])], + outputs=["score"], + examples=[ + {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} + ], + ) + with pytest.raises(ValueError): + component.validate_outputs(expected=["score"], received="some_invalid_json_output") + def test_unsupported_api(self): with pytest.raises(ValueError): LLMEvaluator( From 3d16830a09100dd9e3c496c7bdcadb18532a7d26 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 21 May 2024 19:37:01 +0200 Subject: [PATCH 04/23] adding params --- haystack/components/evaluators/context_relevance.py | 6 ++++++ haystack/components/evaluators/faithfulness.py | 6 ++++++ haystack/components/evaluators/llm_evaluator.py | 4 ++-- test/components/evaluators/test_llm_evaluator.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 9988bdeb02..61f085366a 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -69,6 +69,8 @@ def __init__( examples: Optional[List[Dict[str, Any]]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + raise_on_failure: bool = True, + progress_bar: bool = True, ): """ Creates an instance of ContextRelevanceEvaluator. 
@@ -107,6 +109,8 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key + self.raise_on_failure = raise_on_failure + self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -115,6 +119,8 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, + raise_on_failure=self.raise_on_failure, + progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 2bcbb9b086..e9e38a30e7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -83,6 +83,8 @@ def __init__( examples: Optional[List[Dict[str, Any]]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + raise_on_failure: bool = True, + progress_bar: bool = True, ): """ Creates an instance of FaithfulnessEvaluator. @@ -123,6 +125,8 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key + self.raise_on_failure = raise_on_failure + self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -131,6 +135,8 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, + raise_on_failure=self.raise_on_failure, + progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0bd274ead4..3427012ed1 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -57,7 +57,7 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], - raises_on_failure: bool = True, + raise_on_failure: bool = True, progress_bar: bool = True, *, api: str = "openai", @@ -89,7 +89,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) - self.raise_on_failure = raises_on_failure + self.raise_on_failure = raise_on_failure self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index ef72f5ac7f..7bd459c89c 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -392,7 +392,7 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): examples=[ {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} ], - raises_on_failure=False, + raise_on_failure=False, ) result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") assert np.isnan(result) From 33dd22dbb2c902884c3045a342a990342c80e3ae Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 21 May 2024 23:35:02 +0200 Subject: [PATCH 05/23] adding safeguards for nan in evaluators --- .../evaluators/context_relevance.py | 6 ++- .../components/evaluators/faithfulness.py | 6 ++- .../components/evaluators/llm_evaluator.py | 45 ++++++++++++------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 61f085366a..6f75b88a3e 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional +from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -141,7 +142,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] result = super().run(questions=questions, contexts=contexts) # calculate average statement relevance score per query - for res in result["results"]: + for idx, res in enumerate(result["results"]): + if isinstance(res, float) and isnan(res): + result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + continue if not res["statements"]: res["score"] = 0 else: diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index e9e38a30e7..d9eb3efe38 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional +from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -159,7 +160,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) # calculate average statement faithfulness score per query - for res in result["results"]: + for idx, res in enumerate(result["results"]): + if isinstance(res, float) and isnan(res): + result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + continue if not res["statements"]: res["score"] = 0 else: diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 3427012ed1..0d176e8a61 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -79,7 +79,7 @@ def __init__( `outputs` parameters. Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. - :param raises_on_failure: + :param raise_on_failure: If True, the component will raise an exception if the evaluation fails. :param api: The API to use for calling an LLM through a Generator. @@ -170,6 +170,8 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. + # ToDo: add more details about the behavior of this method and it's exceptions + :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. 
:returns: @@ -187,13 +189,21 @@ def run(self, **inputs) -> Dict[str, Any]: results = [] for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) - result = self.generator.run(prompt=prompt["prompt"]) - - # ToDo: how to handle too large context + try: + result = self.generator.run(prompt=prompt["prompt"]) + except Exception as e: + msg = f"Error while generating response for prompt: {prompt}. Error: {e}" + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + results.append(np.nan) + continue - self.validate_outputs(expected=self.outputs, received=result["replies"][0]) - parsed_result = json.loads(result["replies"][0]) - results.append(parsed_result) + if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): + parsed_result = json.loads(result["replies"][0]) + results.append(parsed_result) + else: + results.append(np.nan) return {"results": results} @@ -307,14 +317,14 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def validate_outputs(self, expected: List[str], received: str) -> Optional[float]: + def is_valid_json(self, expected: List[str], received: str) -> bool: """ - Validate the output. + Output must be a valid JSON with the expected keys. - If `raise_on_failure` is True, raise a ValueError if not all expected outputs are present in the received - outputs or if the received outputs are not a valid JSON. - - If `raise_on_failure` is False, print a warning if the received outputs are not a valid JSON and return a `nan`. + If the output is not a valid JSON with the expected keys: + - with `raise_on_failure` set to True a ValueError is raised. + - with `raise_on_failure` set to False a warning is issued and False is returned. + If the output is a valid JSON with the expected keys, True is returned. :param expected: Names of expected outputs @@ -323,6 +333,9 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float :raises ValueError: If not all expected outputs are present in the received outputs + + :returns: + True if the received output is a valid JSON with the expected keys, False otherwise. """ try: parsed_output = json.loads(received) @@ -332,11 +345,13 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float if self.raise_on_failure: raise ValueError(msg) warn(msg) - return np.nan + return False except json.JSONDecodeError: msg = "Response from LLM evaluator is not a valid JSON." if self.raise_on_failure: raise ValueError(msg) warn(msg) - return np.nan + return False + + return True From 7473d1fd64a31fe31cdab54871af170a3e9fddf9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 09:35:38 +0200 Subject: [PATCH 06/23] adding docstrings --- haystack/components/evaluators/llm_evaluator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0d176e8a61..39633d279b 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -170,7 +170,11 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. - # ToDo: add more details about the behavior of this method and it's exceptions + Running the LLM evaluator is done within a try-except block to catch any exceptions that may + occur during the run. 
If an exception occurs, the method will return a np.nan value for the result. + + Likewise, if the output is not a valid JSON with the expected keys, the method will return a np.nan value + for the result. :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. From b2ff89a7512a6d2a4855f301831249a722c1513b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 09:56:26 +0200 Subject: [PATCH 07/23] fixing tests --- test/components/evaluators/test_llm_evaluator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 7bd459c89c..f1930e3074 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -378,10 +378,10 @@ def test_invalid_outputs(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}') + component.is_valid_json(expected=["score", "another_expected_output"], received='{"score": 1.0}') with pytest.raises(ValueError): - component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}') + component.is_valid_json(expected=["score"], received='{"wrong_name": 1.0}') def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -394,8 +394,7 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): ], raise_on_failure=False, ) - result = component.validate_outputs(expected=["score"], received="some_invalid_json_output") - assert np.isnan(result) + assert component.is_valid_json(expected=["score"], received="some_invalid_json_output") is False def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -408,7 +407,7 @@ def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.validate_outputs(expected=["score"], received="some_invalid_json_output") + component.is_valid_json(expected=["score"], received="some_invalid_json_output") def test_unsupported_api(self): with pytest.raises(ValueError): From 860c2aac85d47e5cfb7663b61a16bec3559595cb Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 10:04:06 +0200 Subject: [PATCH 08/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 39633d279b..e1651b52df 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -4,7 +4,7 @@ import json import logging -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Tuple, Type from warnings import warn import numpy as np From d502ed969ba0bc22375463dbecfe9c8bf4283620 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 10:38:20 +0200 Subject: [PATCH 09/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e1651b52df..177834b8c3 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -15,8 +15,6 @@ from haystack.components.generators import OpenAIGenerator from haystack.utils import Secret, deserialize_secrets_inplace -logger = logging.getLogger(__name__) - @component class LLMEvaluator: @@ -191,6 +189,7 @@ def run(self, **inputs) -> Dict[str, Any]: list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] results = [] + errors = 0 for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) try: @@ -201,6 +200,7 @@ def run(self, **inputs) -> Dict[str, Any]: raise ValueError(msg) warn(msg) results.append(np.nan) + errors += 1 continue if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): @@ -208,6 +208,11 @@ def run(self, **inputs) -> Dict[str, Any]: results.append(parsed_result) else: results.append(np.nan) + errors += 1 + + if errors > 0: + msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs." + warn(msg) return {"results": results} From 2538ed3ec903e11047f681877ec3fbd162b06a36 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 10:47:58 +0200 Subject: [PATCH 10/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 177834b8c3..04671196f8 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import json -import logging from typing import Any, Dict, List, Tuple, Type from warnings import warn From f5f3818e788aac0b861521b54ba4c8b4538b6f31 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 11:47:31 +0200 Subject: [PATCH 11/23] adding tests to context and faithfullness evaluators --- .../test_context_relevance_evaluator.py | 36 ++++++++++++++++ .../evaluators/test_faithfulness_evaluator.py | 41 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index ecbc215d06..7e28edd18d 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -159,6 +159,42 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 2 required positional arguments"): component.run() + def test_run_handles_nan(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = ContextRelevanceEvaluator(progress_bar=False, raise_on_failure=False) + + def generator_run(self, *args, **kwargs): + if "Python" in kwargs["prompt"]: + raise Exception("OpenAI API request failed.") + else: + return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + questions = ["Which is the most popular global sport?", "Who created the Python language?"] + contexts = [ + [ + "The popularity of sports can be measured in various ways, including TV viewership, social media " + "presence, number of participants, and economic impact. Football is undoubtedly the world's most " + "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and " + "Messi, drawing a followership of more than 4 billion people." + ], + [ + "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming " + "language. Its design philosophy emphasizes code readability, and its language constructs aim to help " + "programmers write clear, logical code for both small and large-scale software projects." 
+ ], + ] + results = component.run(questions=questions, contexts=contexts) + assert results == { + "individual_scores": [1, 0], + "results": [ + {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, + {"score": 0, "statement_scores": [], "statements": []}, + ], + "score": 0.5, + } + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index e493b709ef..38434c1dab 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -4,6 +4,7 @@ import os from typing import List +import numpy as np import pytest from haystack.components.evaluators import FaithfulnessEvaluator @@ -191,6 +192,46 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 3 required positional arguments"): component.run() + def test_run_handles_nan(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = FaithfulnessEvaluator(progress_bar=False, raise_on_failure=False) + + def generator_run(self, *args, **kwargs): + if "Python" in kwargs["prompt"]: + raise Exception("OpenAI API request failed.") + else: + return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + questions = ["Which is the most popular global sport?", "Who created the Python language?"] + contexts = [ + [ + "The popularity of sports can be measured in various ways, including TV viewership, social media " + "presence, number of participants, and economic impact. Football is undoubtedly the world's most " + "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and " + "Messi, drawing a followership of more than 4 billion people." + ], + [ + "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming " + "language. Its design philosophy emphasizes code readability, and its language constructs aim to help " + "programmers write clear, logical code for both small and large-scale software projects." + ], + ] + predicted_answers = [ + "Football is the most popular sport with around 4 billion followers worldwide.", + "Guido van Rossum.", + ] + results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + assert results == { + "individual_scores": [1, 0], + "results": [ + {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, + {"score": 0, "statement_scores": [], "statements": []}, + ], + "score": 0.5, + } + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", From a271db7a399bd053a12644110b46e036679d61b8 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 12:24:41 +0200 Subject: [PATCH 12/23] fixing docstrings --- haystack/components/evaluators/llm_evaluator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 04671196f8..0711fcb89d 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -168,10 +168,10 @@ def run(self, **inputs) -> Dict[str, Any]: Run the LLM evaluator. Running the LLM evaluator is done within a try-except block to catch any exceptions that may - occur during the run. If an exception occurs, the method will return a np.nan value for the result. + occur during the run. If an exception occurs, the method will return a `np.nan` value for the result. - Likewise, if the output is not a valid JSON with the expected keys, the method will return a np.nan value - for the result. + Likewise, if the output is not a valid JSON or does not have the expected keys, the method will return a + `np.nan` value for the result. :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. From 54a0146f120f46a782f06bfef70c4652395b5864 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 12:30:33 +0200 Subject: [PATCH 13/23] nit --- haystack/components/evaluators/context_relevance.py | 3 --- haystack/components/evaluators/faithfulness.py | 3 --- haystack/components/evaluators/llm_evaluator.py | 4 +--- .../components/evaluators/test_context_relevance_evaluator.py | 2 +- test/components/evaluators/test_faithfulness_evaluator.py | 2 +- 5 files changed, 3 insertions(+), 11 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 6f75b88a3e..d44b3f3b33 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -71,7 +71,6 @@ def __init__( api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), raise_on_failure: bool = True, - progress_bar: bool = True, ): """ Creates an instance of ContextRelevanceEvaluator. @@ -111,7 +110,6 @@ def __init__( self.api = api self.api_key = api_key self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -121,7 +119,6 @@ def __init__( api=self.api, api_key=self.api_key, raise_on_failure=self.raise_on_failure, - progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index d9eb3efe38..7307e867c7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -85,7 +85,6 @@ def __init__( api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), raise_on_failure: bool = True, - progress_bar: bool = True, ): """ Creates an instance of FaithfulnessEvaluator. 
@@ -127,7 +126,6 @@ def __init__( self.api = api self.api_key = api_key self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar super().__init__( instructions=self.instructions, @@ -137,7 +135,6 @@ def __init__( api=self.api, api_key=self.api_key, raise_on_failure=self.raise_on_failure, - progress_bar=self.progress_bar, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 0711fcb89d..e48043d091 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -55,7 +55,6 @@ def __init__( outputs: List[str], examples: List[Dict[str, Any]], raise_on_failure: bool = True, - progress_bar: bool = True, *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), @@ -87,7 +86,6 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) self.raise_on_failure = raise_on_failure - self.progress_bar = progress_bar self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -189,7 +187,7 @@ def run(self, **inputs) -> Dict[str, Any]: results = [] errors = 0 - for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): + for input_names_to_values in list_of_input_names_to_values: prompt = self.builder.run(**input_names_to_values) try: result = self.generator.run(prompt=prompt["prompt"]) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 7e28edd18d..1f21f8537c 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -161,7 +161,7 @@ def test_run_missing_parameters(self, monkeypatch): def test_run_handles_nan(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = ContextRelevanceEvaluator(progress_bar=False, raise_on_failure=False) + component = ContextRelevanceEvaluator(raise_on_failure=False) def generator_run(self, *args, **kwargs): if "Python" in kwargs["prompt"]: diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 38434c1dab..6e3ea8749f 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -194,7 +194,7 @@ def test_run_missing_parameters(self, monkeypatch): def test_run_handles_nan(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = FaithfulnessEvaluator(progress_bar=False, raise_on_failure=False) + component = FaithfulnessEvaluator(raise_on_failure=False) def generator_run(self, *args, **kwargs): if "Python" in kwargs["prompt"]: From 12164d8971e10c346c8076bcb840139ba059ffc0 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 12:40:28 +0200 Subject: [PATCH 14/23] removing unused imports --- haystack/components/evaluators/llm_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e48043d091..b0a0a6b06e 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -7,7 +7,6 @@ from warnings import warn import numpy as np -from tqdm import tqdm from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder From 687312f75a36df0176e53181693d2320322e5dd8 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 12:44:50 +0200 Subject: [PATCH 15/23] adding release notes --- ...d-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml diff --git a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml new file mode 100644 index 0000000000..ae21560df4 --- /dev/null +++ b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, it returns `np.nan` and continues the evaluation. + The user is notified with a warning indicating the number of requests that failed. From 2b94818d2b4c0cbecd17ac6a8b88b11702150302 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 16:21:36 +0200 Subject: [PATCH 16/23] attending PR comments --- .../evaluators/context_relevance.py | 12 ++++---- .../components/evaluators/faithfulness.py | 11 +++---- .../components/evaluators/llm_evaluator.py | 30 +++++++------------ ...LLM-based-evaluators-34cdc183ab545315.yaml | 2 +- 4 files changed, 25 insertions(+), 30 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index d44b3f3b33..83a8f5af09 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 +from math import isnan from typing import Any, Dict, List, Optional -from numpy import isnan +import numpy as np from numpy import mean as np_mean from haystack import default_from_dict @@ -96,6 +97,8 @@ def __init__( Supported APIs: "openai". :param api_key: The API key. + :param raise_on_failure: + Whether to raise an exception if the API call fails. 
""" self.instructions = ( @@ -109,7 +112,6 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key - self.raise_on_failure = raise_on_failure super().__init__( instructions=self.instructions, @@ -118,7 +120,7 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, - raise_on_failure=self.raise_on_failure, + raise_on_failure=raise_on_failure, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) @@ -140,8 +142,8 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] # calculate average statement relevance score per query for idx, res in enumerate(result["results"]): - if isinstance(res, float) and isnan(res): - result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + if not res: + result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: res["score"] = 0 diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 7307e867c7..3d2dac3458 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 +from math import isnan from typing import Any, Dict, List, Optional -from numpy import isnan from numpy import mean as np_mean from haystack import default_from_dict @@ -111,6 +111,8 @@ def __init__( Supported APIs: "openai". :param api_key: The API key. + :param raise_on_failure: + Whether to raise an exception if the API call fails. """ self.instructions = ( @@ -125,7 +127,6 @@ def __init__( self.examples = examples or _DEFAULT_EXAMPLES self.api = api self.api_key = api_key - self.raise_on_failure = raise_on_failure super().__init__( instructions=self.instructions, @@ -134,7 +135,7 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, - raise_on_failure=self.raise_on_failure, + raise_on_failure=raise_on_failure, ) @component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]]) @@ -158,8 +159,8 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers # calculate average statement faithfulness score per query for idx, res in enumerate(result["results"]): - if isinstance(res, float) and isnan(res): - result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0} + if not res: + result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: res["score"] = 0 diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index b0a0a6b06e..79de577a38 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -53,8 +53,8 @@ def __init__( inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]], - raise_on_failure: bool = True, *, + raise_on_failure: bool = True, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), ): @@ -75,7 +75,7 @@ def __init__( Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. :param raise_on_failure: - If True, the component will raise an exception if the evaluation fails. + If True, the component will raise an exception on an unsuccessful API call. 
:param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". @@ -164,18 +164,13 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. - Running the LLM evaluator is done within a try-except block to catch any exceptions that may - occur during the run. If an exception occurs, the method will return a `np.nan` value for the result. - - Likewise, if the output is not a valid JSON or does not have the expected keys, the method will return a - `np.nan` value for the result. - :param inputs: The input values to evaluate. The keys are the input names and the values are lists of input values. :returns: A dictionary with a single `results` entry that contains a list of results. Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator - and the evaluation results as the values. + and the evaluation results as the values. If an exception occurs for a particular input value, the result + will be `None` for that entry. """ self.validate_input_parameters(dict(self.inputs), inputs) @@ -195,15 +190,15 @@ def run(self, **inputs) -> Dict[str, Any]: if self.raise_on_failure: raise ValueError(msg) warn(msg) - results.append(np.nan) + results.append(None) errors += 1 continue - if self.is_valid_json(expected=self.outputs, received=result["replies"][0]): + if self.is_valid_json_and_has_expected_keys(expected=self.outputs, received=result["replies"][0]): parsed_result = json.loads(result["replies"][0]) results.append(parsed_result) else: - results.append(np.nan) + results.append(None) errors += 1 if errors > 0: @@ -322,22 +317,19 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - def is_valid_json(self, expected: List[str], received: str) -> bool: + def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str) -> bool: """ Output must be a valid JSON with the expected keys. - If the output is not a valid JSON with the expected keys: - - with `raise_on_failure` set to True a ValueError is raised. - - with `raise_on_failure` set to False a warning is issued and False is returned. - If the output is a valid JSON with the expected keys, True is returned. - :param expected: Names of expected outputs :param received: Names of received outputs :raises ValueError: - If not all expected outputs are present in the received outputs + If the output is not a valid JSON with the expected keys: + - with `raise_on_failure` set to True a ValueError is raised. + - with `raise_on_failure` set to False a warning is issued and False is returned. :returns: True if the received output is a valid JSON with the expected keys, False otherwise. diff --git a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml index ae21560df4..a97d33c8a2 100644 --- a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml +++ b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml @@ -1,5 +1,5 @@ --- enhancements: - | - If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, it returns `np.nan` and continues the evaluation. 
+ If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, the score of the sample is set to `NaN` instead of raising an exception. The user is notified with a warning indicating the number of requests that failed. From a2c69dd59906af6d06c45f4454afc9d8f2061cd0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 17:10:14 +0200 Subject: [PATCH 17/23] fixing tests --- haystack/components/evaluators/llm_evaluator.py | 15 +++++++-------- .../evaluators/test_faithfulness_evaluator.py | 9 +++++---- test/components/evaluators/test_llm_evaluator.py | 13 +++++++++---- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 79de577a38..6aec6419dd 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -336,14 +336,6 @@ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str """ try: parsed_output = json.loads(received) - - if not all(output in parsed_output for output in expected): - msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - if self.raise_on_failure: - raise ValueError(msg) - warn(msg) - return False - except json.JSONDecodeError: msg = "Response from LLM evaluator is not a valid JSON." if self.raise_on_failure: @@ -351,4 +343,11 @@ def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str warn(msg) return False + if not all(output in parsed_output for output in expected): + msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." 
+ if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return False + return True diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 6e3ea8749f..a393d29afc 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -223,13 +223,14 @@ def generator_run(self, *args, **kwargs): "Guido van Rossum.", ] results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + assert results == { - "individual_scores": [1, 0], + "individual_scores": [1.0, float(nan)], "results": [ - {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, - {"score": 0, "statement_scores": [], "statements": []}, + {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}, + {"statements": [], "statement_scores": [], "score": float(nan)}, ], - "score": 0.5, + "score": float(nan), } @pytest.mark.skipif( diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index f1930e3074..f96170d66f 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -378,10 +378,12 @@ def test_invalid_outputs(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score", "another_expected_output"], received='{"score": 1.0}') + component.is_valid_json_and_has_expected_keys( + expected=["score", "another_expected_output"], received='{"score": 1.0}' + ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score"], received='{"wrong_name": 1.0}') + component.is_valid_json_and_has_expected_keys(expected=["score"], received='{"wrong_name": 1.0}') def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -394,7 +396,10 @@ def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): ], raise_on_failure=False, ) - assert component.is_valid_json(expected=["score"], received="some_invalid_json_output") is False + assert ( + component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output") + is False + ) def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") @@ -407,7 +412,7 @@ def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): ], ) with pytest.raises(ValueError): - component.is_valid_json(expected=["score"], received="some_invalid_json_output") + component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output") def test_unsupported_api(self): with pytest.raises(ValueError): From e9497ec542e14147f560fc051386fc8433c23f17 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 22 May 2024 17:36:45 +0200 Subject: [PATCH 18/23] fixing tests --- .../components/evaluators/llm_evaluator.py | 2 -- .../test_context_relevance_evaluator.py | 21 ++++++++++++------- .../evaluators/test_faithfulness_evaluator.py | 19 ++++++++++------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 6aec6419dd..7a63e39790 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -6,8 +6,6 @@ from typing import Any, Dict, List, Tuple, Type from warnings import warn -import numpy as np - from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder from haystack.components.generators import OpenAIGenerator diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 1f21f8537c..de5ec907c0 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -4,6 +4,8 @@ import os from typing import List +import math + import pytest from haystack.components.evaluators import ContextRelevanceEvaluator @@ -186,14 +188,17 @@ def generator_run(self, *args, **kwargs): ], ] results = component.run(questions=questions, contexts=contexts) - assert results == { - "individual_scores": [1, 0], - "results": [ - {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]}, - {"score": 0, "statement_scores": [], "statements": []}, - ], - "score": 0.5, - } + + assert math.isnan(results["score"]) + + assert results["individual_scores"][0] == 1.0 + assert math.isnan(results["individual_scores"][1]) + + assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0} + + assert results["results"][1]["statements"] == [] + assert results["results"][1]["statement_scores"] == [] + assert math.isnan(results["results"][1]["score"]) @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index a393d29afc..abfe74c455 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import os +import math from typing import List import numpy as np @@ -224,14 +225,16 @@ def generator_run(self, *args, **kwargs): ] results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) - assert results == { - "individual_scores": [1.0, float(nan)], - "results": [ - {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}, - {"statements": [], "statement_scores": [], "score": float(nan)}, - ], - "score": float(nan), - } + assert math.isnan(results["score"]) + + assert results["individual_scores"][0] == 1.0 + assert math.isnan(results["individual_scores"][1]) + + assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0} + + assert results["results"][1]["statements"] == [] + assert results["results"][1]["statement_scores"] == [] + assert math.isnan(results["results"][1]["score"]) @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), From c0570ecb1843017aea9b425edfcaede52618f9da Mon Sep 17 00:00:00 2001 From: "David 
S. Batista" Date: Wed, 22 May 2024 17:52:51 +0200 Subject: [PATCH 19/23] adding types --- haystack/components/evaluators/llm_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 7a63e39790..fd9bcde39e 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type from warnings import warn from haystack import component, default_from_dict, default_to_dict @@ -177,7 +177,7 @@ def run(self, **inputs) -> Dict[str, Any]: input_names, values = inputs.keys(), list(zip(*inputs.values())) list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] - results = [] + results: List[Optional[Dict[str, Any]]] = [] errors = 0 for input_names_to_values in list_of_input_names_to_values: prompt = self.builder.run(**input_names_to_values) From 796588c5514263f9eeb819756ba4d0701128665b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 22 May 2024 18:02:49 +0200 Subject: [PATCH 20/23] removing unused imports --- haystack/components/evaluators/context_relevance.py | 2 -- haystack/components/evaluators/faithfulness.py | 1 - 2 files changed, 3 deletions(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 83a8f5af09..29ce5edea1 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,10 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from math import isnan from typing import Any, Dict, List, Optional -import numpy as np from numpy import mean as np_mean from haystack import default_from_dict diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 3d2dac3458..7f9dec88e7 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from math import isnan from typing import Any, Dict, List, Optional from numpy import mean as np_mean From 50f64773edaf850eab2d1c5c655a705c8e49bbd4 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 23 May 2024 09:21:26 +0200 Subject: [PATCH 21/23] Update haystack/components/evaluators/context_relevance.py Co-authored-by: Madeesh Kannan --- haystack/components/evaluators/context_relevance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 29ce5edea1..8a4ef124f8 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -140,7 +140,7 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any] # calculate average statement relevance score per query for idx, res in enumerate(result["results"]): - if not res: + if res is None: result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: From 8ce0c9dacef7a93c6449cf70e98da6d3cc59b7cd Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 23 May 2024 09:21:32 +0200 Subject: [PATCH 22/23] Update haystack/components/evaluators/faithfulness.py Co-authored-by: Madeesh Kannan --- haystack/components/evaluators/faithfulness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 7f9dec88e7..8d46e9bf9e 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -158,7 +158,7 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers # calculate average statement faithfulness score per query for idx, res in enumerate(result["results"]): - if not res: + if res is None: result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} continue if not res["statements"]: From 391e4fac564caad6ded4459a6fa8dc182c783a5f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 23 May 2024 17:13:13 +0200 Subject: [PATCH 23/23] attending PR comments --- haystack/components/evaluators/llm_evaluator.py | 3 +++ test/components/evaluators/test_context_relevance_evaluator.py | 2 +- test/components/evaluators/test_faithfulness_evaluator.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index fe6f1e8b4d..fdfe49ffd1 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -175,6 +175,9 @@ def run(self, **inputs) -> Dict[str, Any]: Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator and the evaluation results as the values. If an exception occurs for a particular input value, the result will be `None` for that entry. + :raises ValueError: + Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have + different lengths, or if the output is not a valid JSON or doesn't contain the expected keys. """ self.validate_input_parameters(dict(self.inputs), inputs) diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index de5ec907c0..2db69004d1 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -161,7 +161,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 2 required positional arguments"): component.run() - def test_run_handles_nan(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = ContextRelevanceEvaluator(raise_on_failure=False) diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index abfe74c455..5c32f8c067 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -193,7 +193,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(TypeError, match="missing 3 required positional arguments"): component.run() - def test_run_handles_nan(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = FaithfulnessEvaluator(raise_on_failure=False)