From 2cfa36e5f7e66f2602d8e60eb477310c5306410e Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Tue, 8 Oct 2024 13:37:20 -0700 Subject: [PATCH 1/3] track evaluator errors from sdk --- python/langsmith/client.py | 5 ++++ python/langsmith/evaluation/_runner.py | 32 ++++++++++++++++++++++++ python/langsmith/evaluation/evaluator.py | 2 ++ python/langsmith/schemas.py | 2 ++ 4 files changed, 41 insertions(+) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 6fe4d33c0..ae55a7470 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -4245,6 +4245,7 @@ def _submit_feedback(**kwargs): ), feedback_source_type=ls_schemas.FeedbackSourceType.MODEL, project_id=project_id, + extra=res.extra, ) return results @@ -4315,6 +4316,7 @@ def create_feedback( project_id: Optional[ID_TYPE] = None, comparative_experiment_id: Optional[ID_TYPE] = None, feedback_group_id: Optional[ID_TYPE] = None, + extra: Optional[Dict] = None, **kwargs: Any, ) -> ls_schemas.Feedback: """Create a feedback in the LangSmith API. @@ -4360,6 +4362,8 @@ def create_feedback( feedback_group_id : str or UUID When logging preferences, ranking runs, or other comparative feedback, this is used to group feedback together. + extra : dict + Metadata for the feedback. """ if run_id is None and project_id is None: raise ValueError("One of run_id and project_id must be provided") @@ -4419,6 +4423,7 @@ def create_feedback( comparative_experiment_id, accept_null=True ), feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True), + extra=extra, ) feedback_block = _dumps_json(feedback.dict(exclude_none=True)) self.request_with_retries( diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index a040ea7a3..378abb124 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -13,6 +13,8 @@ import random import threading import uuid +import inspect +import re from contextvars import copy_context from typing import ( Awaitable, @@ -82,6 +84,27 @@ ], ] +def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]: + # Find the return statement + return_match = re.search(r'return\s*({[^}]+})', python_code) + if not return_match: + return [] + + # Extract the dictionary from the return statement + dict_str = return_match.group(1) + + # Find all keys in the dictionary + key_matches = re.findall(r'"([^"]+)":', dict_str) + + # Filter out 'key' and 'score' + feedback_keys = [key for key in key_matches if key not in ['key', 'score']] + + # If 'key' is present in the dictionary, add its value to the feedback_keys + key_value_match = re.search(r'"key"\s*:\s*"([^"]+)"', dict_str) + if key_value_match: + feedback_keys.append(key_value_match.group(1)) + + return feedback_keys def evaluate( target: TARGET_T, @@ -1353,6 +1376,15 @@ def _run_evaluators( ) ) except Exception as e: + feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func)) + error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id, + comment=repr(e),extra={"error":True}) for key in feedback_keys]) + eval_results["results"].extend( + # TODO: This is a hack + self.client._log_evaluation_feedback( + error_response, run=run, _executor=executor + ) + ) logger.error( f"Error running evaluator {repr(evaluator)} on" f" run {run.id}: {repr(e)}", diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 7e3e748ba..065f5b16b 100644 --- 
a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -90,6 +90,8 @@ class EvaluationResult(BaseModel): If none provided, the evaluation feedback is applied to the root trace being.""" + extra: Optional[Dict] = None + """Metadata for the evaluator run.""" class Config: """Pydantic model configuration.""" diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index d859418e7..1c5145472 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -460,6 +460,8 @@ class FeedbackBase(BaseModel): """For preference scoring, this group ID is shared across feedbacks for each run in the group that was being compared.""" + extra: Optional[Dict] = None + """The metadata of the feedback.""" class Config: """Configuration class for the schema.""" From 6de1edf18c5310667bc4ab44a7973e5cc239cf44 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Tue, 8 Oct 2024 21:51:17 -0700 Subject: [PATCH 2/3] tests + error handling --- python/langsmith/client.py | 3 +- python/langsmith/evaluation/_runner.py | 101 ++++++++++++++++----- python/tests/evaluation/test_evaluation.py | 84 +++++++++++++++++ 3 files changed, 162 insertions(+), 26 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index ae55a7470..b8c1dd05a 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -4214,13 +4214,12 @@ def _log_evaluation_feedback( _executor: Optional[cf.ThreadPoolExecutor] = None, ) -> List[ls_evaluator.EvaluationResult]: results = self._select_eval_results(evaluator_response) - def _submit_feedback(**kwargs): if _executor: _executor.submit(self.create_feedback, **kwargs) else: self.create_feedback(**kwargs) - + for res in results: source_info_ = source_info or {} if res.evaluator_info: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 378abb124..1e5325555 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -14,7 +14,8 @@ import threading import uuid import inspect -import re +import ast +import textwrap from contextvars import copy_context from typing import ( Awaitable, @@ -84,27 +85,76 @@ ], ] + + def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]: - # Find the return statement - return_match = re.search(r'return\s*({[^}]+})', python_code) - if not return_match: + def extract_dict_keys(node): + if isinstance(node, ast.Dict): + keys = [] + key_value = None + for key, value in zip(node.keys, node.values): + if isinstance(key, (ast.Str, ast.Constant)): + key_str = key.s if isinstance(key, ast.Str) else key.value + if key_str == 'key' and isinstance(value, (ast.Str, ast.Constant)): + key_value = value.s if isinstance(value, ast.Str) else value.value + elif key_str not in ['key', 'score']: + keys.append(key_str) + return [key_value] if key_value else keys return [] - # Extract the dictionary from the return statement - dict_str = return_match.group(1) - - # Find all keys in the dictionary - key_matches = re.findall(r'"([^"]+)":', dict_str) + def extract_evaluation_result_key(node): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'EvaluationResult': + for keyword in node.keywords: + if keyword.arg == 'key' and isinstance(keyword.value, (ast.Str, ast.Constant)): + return [keyword.value.s if isinstance(keyword.value, ast.Str) else keyword.value.value] + return [] - # Filter out 'key' and 'score' - feedback_keys = [key for key in key_matches if key not in 
['key', 'score']] + def extract_evaluation_results_keys(node, variables): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'EvaluationResults': + for keyword in node.keywords: + if keyword.arg == 'results': + if isinstance(keyword.value, ast.Name): + return variables.get(keyword.value.id, []) + elif isinstance(keyword.value, ast.List): + keys = [] + for elt in keyword.value.elts: + keys.extend(extract_evaluation_result_key(elt)) + return keys + return [] - # If 'key' is present in the dictionary, add its value to the feedback_keys - key_value_match = re.search(r'"key"\s*:\s*"([^"]+)"', dict_str) - if key_value_match: - feedback_keys.append(key_value_match.group(1)) + python_code = textwrap.dedent(python_code) - return feedback_keys + try: + tree = ast.parse(python_code) + function_def = tree.body[0] + if not isinstance(function_def, ast.FunctionDef): + return [] + + variables = {} + keys = [] + + for node in ast.walk(function_def): + if isinstance(node, ast.Assign): + if isinstance(node.value, ast.List): + list_keys = [] + for elt in node.value.elts: + list_keys.extend(extract_evaluation_result_key(elt)) + if isinstance(node.targets[0], ast.Name): + variables[node.targets[0].id] = list_keys + elif isinstance(node, ast.Return) and node.value is not None: + dict_keys = extract_dict_keys(node.value) + eval_result_key = extract_evaluation_result_key(node.value) + eval_results_keys = extract_evaluation_results_keys(node.value, variables) + + keys.extend(dict_keys) + keys.extend(eval_result_key) + keys.extend(eval_results_keys) + + # If no keys found, return the function name + return keys if keys else [function_def.name] + + except SyntaxError: + return [] def evaluate( target: TARGET_T, @@ -1376,15 +1426,18 @@ def _run_evaluators( ) ) except Exception as e: - feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func)) - error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id, - comment=repr(e),extra={"error":True}) for key in feedback_keys]) - eval_results["results"].extend( - # TODO: This is a hack - self.client._log_evaluation_feedback( - error_response, run=run, _executor=executor + try: + feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func)) + error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id, + comment=repr(e),extra={"error":True}) for key in feedback_keys]) + eval_results["results"].extend( + # TODO: This is a hack + self.client._log_evaluation_feedback( + error_response, run=run, _executor=executor + ) ) - ) + except: + pass logger.error( f"Error running evaluator {repr(evaluator)} on" f" run {run.id}: {repr(e)}", diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py index 62eb0551c..8e02a91dc 100644 --- a/python/tests/evaluation/test_evaluation.py +++ b/python/tests/evaluation/test_evaluation.py @@ -3,6 +3,11 @@ from typing import Callable, Sequence, Tuple, TypeVar import pytest +import sys +import os + +# Add the current directory (which contains 'langsmith') to the Python path +sys.path.insert(0, os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))) from langsmith import Client, aevaluate, evaluate, expect, test from langsmith.schemas import Example, Run @@ -32,6 +37,85 @@ def wait_for( raise ValueError(f"Callable did not return within {total_time}") +def test_error_handling_evaluators(): + client = Client() + _ = client.clone_public_dataset( 
+ "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d" + ) + dataset_name = "Evaluate Examples" + + # Case 1: Normal dictionary return + def error_dict_evaluator(run: Run, example: Example): + if True: # This condition ensures the error is always raised + raise ValueError("Error in dict evaluator") + return {"key": "dict_key", "score": 1} + + # Case 2: EvaluationResult return + def error_evaluation_result(run: Run, example: Example): + if True: # This condition ensures the error is always raised + raise ValueError("Error in EvaluationResult evaluator") + return EvaluationResult(key="eval_result_key", score=1) + + # Case 3: EvaluationResults return + def error_evaluation_results(run: Run, example: Example): + if True: # This condition ensures the error is always raised + raise ValueError("Error in EvaluationResults evaluator") + return EvaluationResults( + results=[ + EvaluationResult(key="eval_results_key1", score=1), + EvaluationResult(key="eval_results_key2", score=2) + ] + ) + + # Case 4: Dictionary without 'key' field + def error_dict_no_key(run: Run, example: Example): + if True: # This condition ensures the error is always raised + raise ValueError("Error in dict without key evaluator") + return {"score":1} + + def predict(inputs: dict) -> dict: + return {"output": "Yes"} + + results = evaluate( + predict, + data=dataset_name, + evaluators=[ + error_dict_evaluator, + error_evaluation_result, + error_evaluation_results, + error_dict_no_key, + ], + max_concurrency=1, # To ensure deterministic order + ) + + assert len(results) == 10 # Assuming 10 examples in the dataset + + for result in results: + eval_results = result["evaluation_results"]["results"] + assert len(eval_results) == 5 + + # Check error handling for each evaluator + assert eval_results[0].key == "dict_key" + assert "Error in dict evaluator" in eval_results[0].comment + assert eval_results[0].extra.get("error") is True + + assert eval_results[1].key == "eval_result_key" + assert "Error in EvaluationResult evaluator" in eval_results[1].comment + assert eval_results[1].extra.get("error") is True + + assert eval_results[2].key == "eval_results_key1" + assert "Error in EvaluationResults evaluator" in eval_results[2].comment + assert eval_results[2].extra.get("error") is True + + assert eval_results[3].key == "eval_results_key2" + assert "Error in EvaluationResults evaluator" in eval_results[3].comment + assert eval_results[3].extra.get("error") is True + + assert eval_results[4].key == "error_dict_no_key" + assert "Error in dict without key evaluator" in eval_results[4].comment + assert eval_results[4].extra.get("error") is True + + def test_evaluate(): client = Client() _ = client.clone_public_dataset( From ba83c0af9aa81bf998c26af469c40fb9968187d5 Mon Sep 17 00:00:00 2001 From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:15:37 -0700 Subject: [PATCH 3/3] Fix tests & logs --- python/langsmith/client.py | 3 +- python/langsmith/evaluation/_runner.py | 202 +++++++++++++-------- python/tests/evaluation/test_evaluation.py | 120 +++++++----- 3 files changed, 203 insertions(+), 122 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index b8c1dd05a..ae55a7470 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -4214,12 +4214,13 @@ def _log_evaluation_feedback( _executor: Optional[cf.ThreadPoolExecutor] = None, ) -> List[ls_evaluator.EvaluationResult]: results = self._select_eval_results(evaluator_response) + def 
_submit_feedback(**kwargs): if _executor: _executor.submit(self.create_feedback, **kwargs) else: self.create_feedback(**kwargs) - + for res in results: source_info_ = source_info or {} if res.evaluator_info: diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 1e5325555..88de67e85 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -2,20 +2,20 @@ from __future__ import annotations +import ast import collections import concurrent.futures as cf import datetime import functools +import inspect import itertools import logging import pathlib import queue import random +import textwrap import threading import uuid -import inspect -import ast -import textwrap from contextvars import copy_context from typing import ( Awaitable, @@ -45,6 +45,7 @@ from langsmith.evaluation.evaluator import ( ComparisonEvaluationResult, DynamicComparisonRunEvaluator, + DynamicRunEvaluator, EvaluationResult, EvaluationResults, RunEvaluator, @@ -86,76 +87,6 @@ ] - -def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]: - def extract_dict_keys(node): - if isinstance(node, ast.Dict): - keys = [] - key_value = None - for key, value in zip(node.keys, node.values): - if isinstance(key, (ast.Str, ast.Constant)): - key_str = key.s if isinstance(key, ast.Str) else key.value - if key_str == 'key' and isinstance(value, (ast.Str, ast.Constant)): - key_value = value.s if isinstance(value, ast.Str) else value.value - elif key_str not in ['key', 'score']: - keys.append(key_str) - return [key_value] if key_value else keys - return [] - - def extract_evaluation_result_key(node): - if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'EvaluationResult': - for keyword in node.keywords: - if keyword.arg == 'key' and isinstance(keyword.value, (ast.Str, ast.Constant)): - return [keyword.value.s if isinstance(keyword.value, ast.Str) else keyword.value.value] - return [] - - def extract_evaluation_results_keys(node, variables): - if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'EvaluationResults': - for keyword in node.keywords: - if keyword.arg == 'results': - if isinstance(keyword.value, ast.Name): - return variables.get(keyword.value.id, []) - elif isinstance(keyword.value, ast.List): - keys = [] - for elt in keyword.value.elts: - keys.extend(extract_evaluation_result_key(elt)) - return keys - return [] - - python_code = textwrap.dedent(python_code) - - try: - tree = ast.parse(python_code) - function_def = tree.body[0] - if not isinstance(function_def, ast.FunctionDef): - return [] - - variables = {} - keys = [] - - for node in ast.walk(function_def): - if isinstance(node, ast.Assign): - if isinstance(node.value, ast.List): - list_keys = [] - for elt in node.value.elts: - list_keys.extend(extract_evaluation_result_key(elt)) - if isinstance(node.targets[0], ast.Name): - variables[node.targets[0].id] = list_keys - elif isinstance(node, ast.Return) and node.value is not None: - dict_keys = extract_dict_keys(node.value) - eval_result_key = extract_evaluation_result_key(node.value) - eval_results_keys = extract_evaluation_results_keys(node.value, variables) - - keys.extend(dict_keys) - keys.extend(eval_result_key) - keys.extend(eval_results_keys) - - # If no keys found, return the function name - return keys if keys else [function_def.name] - - except SyntaxError: - return [] - def evaluate( target: TARGET_T, /, @@ -1427,16 +1358,27 @@ def _run_evaluators( ) 
except Exception as e: try: - feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func)) - error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id, - comment=repr(e),extra={"error":True}) for key in feedback_keys]) + feedback_keys = _extract_feedback_keys(evaluator) + + error_response = EvaluationResults( + results=[ + EvaluationResult( + key=key, + source_run_id=run.id, + comment=repr(e), + extra={"error": True}, + ) + for key in feedback_keys + ] + ) eval_results["results"].extend( # TODO: This is a hack self.client._log_evaluation_feedback( error_response, run=run, _executor=executor ) ) - except: + except BaseException as e2: + logger.debug(f"Error parsing feedback keys: {e2}") pass logger.error( f"Error running evaluator {repr(evaluator)} on" @@ -1735,3 +1677,109 @@ def _get_random_name() -> str: from langsmith.evaluation._name_generation import random_name # noqa: F401 return random_name() + + +def _extract_feedback_keys(evaluator: RunEvaluator): + if isinstance(evaluator, DynamicRunEvaluator): + if getattr(evaluator, "func", None): + return _extract_code_evaluator_feedback_keys(evaluator.func) + elif getattr(evaluator, "afunc", None): + return _extract_code_evaluator_feedback_keys(evaluator.afunc) + # TODO: Support for DynamicComparisonRunEvaluator + if hasattr(evaluator, "evaluator"): + # LangChainStringEvaluator + if getattr(getattr(evaluator, "evaluator"), "evaluation_name", None): + return [evaluator.evaluator.evaluation_name] + return [] + + +def _extract_code_evaluator_feedback_keys(func: Callable) -> list[str]: + python_code = inspect.getsource(func) + + def extract_dict_keys(node): + if isinstance(node, ast.Dict): + keys = [] + key_value = None + for key, value in zip(node.keys, node.values): + if isinstance(key, (ast.Str, ast.Constant)): + key_str = key.s if isinstance(key, ast.Str) else key.value + if key_str == "key" and isinstance(value, (ast.Str, ast.Constant)): + key_value = ( + value.s if isinstance(value, ast.Str) else value.value + ) + elif key_str not in ["key", "score"]: + keys.append(key_str) + return [key_value] if key_value else keys + return [] + + def extract_evaluation_result_key(node): + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "EvaluationResult" + ): + for keyword in node.keywords: + if keyword.arg == "key" and isinstance( + keyword.value, (ast.Str, ast.Constant) + ): + return [ + ( + keyword.value.s + if isinstance(keyword.value, ast.Str) + else keyword.value.value + ) + ] + return [] + + def extract_evaluation_results_keys(node, variables): + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "EvaluationResults" + ): + for keyword in node.keywords: + if keyword.arg == "results": + if isinstance(keyword.value, ast.Name): + return variables.get(keyword.value.id, []) + elif isinstance(keyword.value, ast.List): + keys = [] + for elt in keyword.value.elts: + keys.extend(extract_evaluation_result_key(elt)) + return keys + return [] + + python_code = textwrap.dedent(python_code) + + try: + tree = ast.parse(python_code) + function_def = tree.body[0] + if not isinstance(function_def, ast.FunctionDef): + return [] + + variables = {} + keys = [] + + for node in ast.walk(function_def): + if isinstance(node, ast.Assign): + if isinstance(node.value, ast.List): + list_keys = [] + for elt in node.value.elts: + list_keys.extend(extract_evaluation_result_key(elt)) + if isinstance(node.targets[0], ast.Name): + 
variables[node.targets[0].id] = list_keys + elif isinstance(node, ast.Return) and node.value is not None: + dict_keys = extract_dict_keys(node.value) + eval_result_key = extract_evaluation_result_key(node.value) + eval_results_keys = extract_evaluation_results_keys( + node.value, variables + ) + + keys.extend(dict_keys) + keys.extend(eval_result_key) + keys.extend(eval_results_keys) + + # If no keys found, return the function name + return keys if keys else [function_def.name] + + except SyntaxError: + return [] diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py index 8e02a91dc..74b8b0301 100644 --- a/python/tests/evaluation/test_evaluation.py +++ b/python/tests/evaluation/test_evaluation.py @@ -1,20 +1,29 @@ import asyncio +import logging import time +from contextlib import contextmanager from typing import Callable, Sequence, Tuple, TypeVar import pytest -import sys -import os - -# Add the current directory (which contains 'langsmith') to the Python path -sys.path.insert(0, os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))) from langsmith import Client, aevaluate, evaluate, expect, test +from langsmith.evaluation import EvaluationResult, EvaluationResults from langsmith.schemas import Example, Run T = TypeVar("T") +@contextmanager +def suppress_warnings(): + logger = logging.getLogger() + current_level = logger.level + logger.setLevel(logging.CRITICAL) + try: + yield + finally: + logger.setLevel(current_level) + + def wait_for( condition: Callable[[], Tuple[T, bool]], max_sleep_time: int = 120, @@ -37,7 +46,7 @@ def wait_for( raise ValueError(f"Callable did not return within {total_time}") -def test_error_handling_evaluators(): +async def test_error_handling_evaluators(): client = Client() _ = client.clone_public_dataset( "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d" @@ -63,7 +72,7 @@ def error_evaluation_results(run: Run, example: Example): return EvaluationResults( results=[ EvaluationResult(key="eval_results_key1", score=1), - EvaluationResult(key="eval_results_key2", score=2) + EvaluationResult(key="eval_results_key2", score=2), ] ) @@ -71,49 +80,72 @@ def error_evaluation_results(run: Run, example: Example): def error_dict_no_key(run: Run, example: Example): if True: # This condition ensures the error is always raised raise ValueError("Error in dict without key evaluator") - return {"score":1} + return {"score": 1} def predict(inputs: dict) -> dict: return {"output": "Yes"} - results = evaluate( - predict, - data=dataset_name, - evaluators=[ - error_dict_evaluator, - error_evaluation_result, - error_evaluation_results, - error_dict_no_key, - ], - max_concurrency=1, # To ensure deterministic order - ) - - assert len(results) == 10 # Assuming 10 examples in the dataset - - for result in results: - eval_results = result["evaluation_results"]["results"] - assert len(eval_results) == 5 - - # Check error handling for each evaluator - assert eval_results[0].key == "dict_key" - assert "Error in dict evaluator" in eval_results[0].comment - assert eval_results[0].extra.get("error") is True - - assert eval_results[1].key == "eval_result_key" - assert "Error in EvaluationResult evaluator" in eval_results[1].comment - assert eval_results[1].extra.get("error") is True - - assert eval_results[2].key == "eval_results_key1" - assert "Error in EvaluationResults evaluator" in eval_results[2].comment - assert eval_results[2].extra.get("error") is True + with suppress_warnings(): + sync_results = 
evaluate( + predict, + data=dataset_name, + evaluators=[ + error_dict_evaluator, + error_evaluation_result, + error_evaluation_results, + error_dict_no_key, + ], + max_concurrency=1, # To ensure deterministic order + ) - assert eval_results[3].key == "eval_results_key2" - assert "Error in EvaluationResults evaluator" in eval_results[3].comment - assert eval_results[3].extra.get("error") is True + assert len(sync_results) == 10 # Assuming 10 examples in the dataset + + def check_results(results): + for result in results: + eval_results = result["evaluation_results"]["results"] + assert len(eval_results) == 5 + + # Check error handling for each evaluator + assert eval_results[0].key == "dict_key" + assert "Error in dict evaluator" in eval_results[0].comment + assert eval_results[0].extra.get("error") is True + + assert eval_results[1].key == "eval_result_key" + assert "Error in EvaluationResult evaluator" in eval_results[1].comment + assert eval_results[1].extra.get("error") is True + + assert eval_results[2].key == "eval_results_key1" + assert "Error in EvaluationResults evaluator" in eval_results[2].comment + assert eval_results[2].extra.get("error") is True + + assert eval_results[3].key == "eval_results_key2" + assert "Error in EvaluationResults evaluator" in eval_results[3].comment + assert eval_results[3].extra.get("error") is True + + assert eval_results[4].key == "error_dict_no_key" + assert "Error in dict without key evaluator" in eval_results[4].comment + assert eval_results[4].extra.get("error") is True + + check_results(sync_results) + + async def apredict(inputs: dict): + return predict(inputs) + + with suppress_warnings(): + async_results = await aevaluate( + apredict, + data=dataset_name, + evaluators=[ + error_dict_evaluator, + error_evaluation_result, + error_evaluation_results, + error_dict_no_key, + ], + max_concurrency=1, # To ensure deterministic order + ) - assert eval_results[4].key == "error_dict_no_key" - assert "Error in dict without key evaluator" in eval_results[4].comment - assert eval_results[4].extra.get("error") is True + assert len(async_results) == 10 # Assuming 10 examples in the dataset + check_results([res async for res in async_results]) def test_evaluate():
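# Reviewer note (illustration only, not part of the patch): the error-handling
# tests above depend on the runner's new fallback behavior -- when an evaluator
# raises, the SDK tries to infer which feedback keys that evaluator would have
# produced, then logs one feedback row per key with comment=repr(error) and
# extra={"error": True}, falling back to the evaluator's function name when no
# key can be inferred. The snippet below is a minimal, stdlib-only sketch of
# that key-guessing idea; it is a simplified stand-in for the patch's
# _extract_code_evaluator_feedback_keys helper (guess_feedback_keys and
# my_evaluator are hypothetical names used only for this example, and the
# EvaluationResults/variable-tracking cases handled by the real helper are
# omitted here).
import ast
import inspect
import textwrap
from typing import Callable, List


def guess_feedback_keys(func: Callable) -> List[str]:
    """Best-effort guess of the feedback keys an evaluator would emit."""
    source = textwrap.dedent(inspect.getsource(func))
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return []
    fn = tree.body[0]
    if not isinstance(fn, ast.FunctionDef):
        return []

    keys: List[str] = []
    for node in ast.walk(fn):
        # return {"key": "precision", "score": ...}  ->  ["precision"]
        if isinstance(node, ast.Return) and isinstance(node.value, ast.Dict):
            for k, v in zip(node.value.keys, node.value.values):
                if (
                    isinstance(k, ast.Constant)
                    and k.value == "key"
                    and isinstance(v, ast.Constant)
                ):
                    keys.append(v.value)
        # EvaluationResult(key="recall", ...)  ->  ["recall"]
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id == "EvaluationResult"
        ):
            for kw in node.keywords:
                if kw.arg == "key" and isinstance(kw.value, ast.Constant):
                    keys.append(kw.value.value)
    # Like the patch, fall back to the function name when nothing is found,
    # so a dict return without a "key" field still yields a usable label.
    return keys or [fn.name]


def my_evaluator(run, example):
    raise ValueError("boom")
    # Unreachable at runtime, but still visible to ast.parse, which is what
    # lets the key be recovered after the evaluator has already failed.
    return {"key": "precision", "score": 1}


print(guess_feedback_keys(my_evaluator))  # ['precision']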