fix: failsafe for non-valid json and failed LLM calls #7723

Merged: 27 commits, merged on May 23, 2024

Changes from 1 commit

Commits (27)
a21b0c2  wip (davidsbatista, May 21, 2024)
91ad2ef  initial import (davidsbatista, May 21, 2024)
8746035  adding tests (davidsbatista, May 21, 2024)
3d16830  adding params (davidsbatista, May 21, 2024)
33dd22d  adding safeguards for nan in evaluators (davidsbatista, May 21, 2024)
7473d1f  adding docstrings (davidsbatista, May 22, 2024)
b2ff89a  fixing tests (davidsbatista, May 22, 2024)
75af5ff  Merge branch 'main' into failsafe-for-non-valid-JSON (davidsbatista, May 22, 2024)
860c2aa  removing unused imports (davidsbatista, May 22, 2024)
d502ed9  removing unused imports (davidsbatista, May 22, 2024)
2538ed3  removing unused imports (davidsbatista, May 22, 2024)
f5f3818  adding tests to context and faithfullness evaluators (davidsbatista, May 22, 2024)
a271db7  fixing docstrings (davidsbatista, May 22, 2024)
54a0146  nit (davidsbatista, May 22, 2024)
12164d8  removing unused imports (davidsbatista, May 22, 2024)
687312f  adding release notes (davidsbatista, May 22, 2024)
2b94818  attending PR comments (davidsbatista, May 22, 2024)
a2c69dd  fixing tests (davidsbatista, May 22, 2024)
e9497ec  fixing tests (davidsbatista, May 22, 2024)
f98930d  Merge branch 'main' into failsafe-for-non-valid-JSON (davidsbatista, May 22, 2024)
c0570ec  adding types (davidsbatista, May 22, 2024)
796588c  removing unused imports (davidsbatista, May 22, 2024)
50f6477  Update haystack/components/evaluators/context_relevance.py (davidsbatista, May 23, 2024)
8ce0c9d  Update haystack/components/evaluators/faithfulness.py (davidsbatista, May 23, 2024)
a7d7879  Merge branch 'main' into failsafe-for-non-valid-JSON (davidsbatista, May 23, 2024)
391e4fa  attending PR comments (davidsbatista, May 23, 2024)
a49fc65  Merge branch 'main' into failsafe-for-non-valid-JSON (davidsbatista, May 23, 2024)

adding safeguards for nan in evaluators
davidsbatista committed May 21, 2024
commit 33dd22dbb2c902884c3045a342a990342c80e3ae
6 changes: 5 additions & 1 deletion haystack/components/evaluators/context_relevance.py
@@ -4,6 +4,7 @@

 from typing import Any, Dict, List, Optional

+from numpy import isnan
 from numpy import mean as np_mean

 from haystack import default_from_dict
@@ -141,7 +142,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
         result = super().run(questions=questions, contexts=contexts)

         # calculate average statement relevance score per query
-        for res in result["results"]:
+        for idx, res in enumerate(result["results"]):
+            if isinstance(res, float) and isnan(res):
+                result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0}
+                continue
             if not res["statements"]:
                 res["score"] = 0
             else:

6 changes: 5 additions & 1 deletion haystack/components/evaluators/faithfulness.py
@@ -4,6 +4,7 @@

 from typing import Any, Dict, List, Optional

+from numpy import isnan
 from numpy import mean as np_mean

 from haystack import default_from_dict
@@ -159,7 +160,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
         result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

         # calculate average statement faithfulness score per query
-        for res in result["results"]:
+        for idx, res in enumerate(result["results"]):
+            if isinstance(res, float) and isnan(res):
+                result["results"][idx] = {"statements": [], "statement_scores": [], "score": 0}
+                continue
             if not res["statements"]:
                 res["score"] = 0
             else:
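
Both evaluators apply the same safeguard. As a standalone illustration (not Haystack code, and the example data is made up): any NaN entry left behind by a failed LLM call or a non-valid JSON reply is replaced with an empty result scored 0 before the average score is computed.

from numpy import isnan
from numpy import mean as np_mean

# one entry per input question; the second LLM call failed or returned non-valid JSON
results = [
    {"statements": ["s1", "s2"], "statement_scores": [1, 0], "score": 0.5},
    float("nan"),
]

for idx, res in enumerate(results):
    if isinstance(res, float) and isnan(res):
        # same placeholder the evaluators insert in the diff above
        results[idx] = {"statements": [], "statement_scores": [], "score": 0}

print(np_mean([res["score"] for res in results]))  # 0.25 instead of a crash
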
45 changes: 30 additions & 15 deletions haystack/components/evaluators/llm_evaluator.py
@@ -79,7 +79,7 @@ def __init__(
             `outputs` parameters.
             Each example is a dictionary with keys "inputs" and "outputs"
             They contain the input and output as dictionaries respectively.
-        :param raises_on_failure:
+        :param raise_on_failure:
             If True, the component will raise an exception if the evaluation fails.
         :param api:
             The API to use for calling an LLM through a Generator.
@@ -170,6 +170,8 @@ def run(self, **inputs) -> Dict[str, Any]:
         """
         Run the LLM evaluator.

+        # ToDo: add more details about the behavior of this method and it's exceptions
+
         :param inputs:
             The input values to evaluate. The keys are the input names and the values are lists of input values.
         :returns:
@@ -187,13 +189,21 @@ def run(self, **inputs) -> Dict[str, Any]:
         results = []
         for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
             prompt = self.builder.run(**input_names_to_values)
-            result = self.generator.run(prompt=prompt["prompt"])

+            # ToDo: how to handle too large context
+            try:
+                result = self.generator.run(prompt=prompt["prompt"])
+            except Exception as e:
+                msg = f"Error while generating response for prompt: {prompt}. Error: {e}"
+                if self.raise_on_failure:
+                    raise ValueError(msg)
+                warn(msg)
+                results.append(np.nan)
+                continue

-            self.validate_outputs(expected=self.outputs, received=result["replies"][0])
-            parsed_result = json.loads(result["replies"][0])
-            results.append(parsed_result)
+            if self.is_valid_json(expected=self.outputs, received=result["replies"][0]):
+                parsed_result = json.loads(result["replies"][0])
+                results.append(parsed_result)
+            else:
+                results.append(np.nan)

         return {"results": results}

@@ -307,14 +317,14 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]
             )
             raise ValueError(msg)

-    def validate_outputs(self, expected: List[str], received: str) -> Optional[float]:
+    def is_valid_json(self, expected: List[str], received: str) -> bool:
         """
-        Validate the output.
+        Output must be a valid JSON with the expected keys.

-        If `raise_on_failure` is True, raise a ValueError if not all expected outputs are present in the received
-        outputs or if the received outputs are not a valid JSON.
-
-        If `raise_on_failure` is False, print a warning if the received outputs are not a valid JSON and return a `nan`.
+        If the output is not a valid JSON with the expected keys:
+        - with `raise_on_failure` set to True a ValueError is raised.
+        - with `raise_on_failure` set to False a warning is issued and False is returned.
+        If the output is a valid JSON with the expected keys, True is returned.

         :param expected:
             Names of expected outputs
@@ -323,6 +333,9 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float

         :raises ValueError:
             If not all expected outputs are present in the received outputs
+
+        :returns:
+            True if the received output is a valid JSON with the expected keys, False otherwise.
         """
         try:
             parsed_output = json.loads(received)
@@ -332,11 +345,13 @@ def validate_outputs(self, expected: List[str], received: str) -> Optional[float
                 if self.raise_on_failure:
                     raise ValueError(msg)
                 warn(msg)
-                return np.nan
+                return False

         except json.JSONDecodeError:
             msg = "Response from LLM evaluator is not a valid JSON."
             if self.raise_on_failure:
                 raise ValueError(msg)
             warn(msg)
-            return np.nan
+            return False
+
+        return True
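
For reference, a usage sketch of how the failsafe surfaces to callers. The constructor arguments follow the LLMEvaluator docstring, raise_on_failure is the parameter this PR wires through, and an OpenAI key in the OPENAI_API_KEY environment variable is assumed; treat it as a sketch rather than a verbatim snippet from this commit.

from typing import List

from haystack.components.evaluators import LLMEvaluator

evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children? Respond with JSON containing a 'score' key (0 or 1).",
    inputs=[("predicted_answers", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
    ],
    raise_on_failure=False,  # failsafe mode: failures become NaN entries instead of exceptions
)

result = evaluator.run(predicted_answers=["Damn, this is straight outta hell!!!"])
# Each entry in result["results"] is either the parsed JSON reply, e.g. {"score": 1},
# or NaN if the LLM call failed or did not return valid JSON with the expected keys.
print(result["results"])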