From 197c037c7e24837a6f078c6e6be83dbab7d75f8c Mon Sep 17 00:00:00 2001
From: Bagatur
Date: Mon, 23 Dec 2024 19:03:42 -0500
Subject: [PATCH] fmt

---
 python/tests/integration_tests/test_client.py | 60 -------------------
 .../unit_tests/evaluation/test_runner.py      | 45 ++++++++------
 2 files changed, 27 insertions(+), 78 deletions(-)

diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index 25536149d..3bcd9d04c 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -24,7 +24,6 @@
 from langsmith.schemas import (
     AttachmentsOperations,
     DataType,
-    EvaluationResult,
     Example,
     ExampleUpdateWithAttachments,
     ExampleUploadWithAttachments,
@@ -1255,65 +1254,6 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None:
     langchain_client.delete_dataset(dataset_id=dataset.id)
 
 
-async def test_summary_evaluation_with_evaluator_results(
-    langchain_client: Client,
-) -> None:
-    """Test summary evaluators receive evaluator results."""
-    dataset_name = "__test_summary_evaluation_inline_eval" + uuid4().hex[:4]
-    dataset = langchain_client.create_dataset(
-        dataset_name,
-        description="Test dataset for evals with attachments",
-        data_type=DataType.kv,
-    )
-
-    example_id = uuid4()
-    langchain_client.create_example(
-        dataset_id=dataset.id,
-        inputs={"question": "What is 2+2?"},
-        outputs={"answer": "4"},
-        example_id=example_id,
-    )
-
-    def target(inputs: Dict[str, Any]) -> Dict[str, Any]:
-        return {"answer": "4"}
-
-    async def target_async(inputs: Dict[str, Any]) -> Dict[str, Any]:
-        return {"answer": "4"}
-
-    def evaluator(outputs: dict, reference_outputs: dict) -> dict:
-        return {"score": 1, "key": "foo"}
-
-    def summary_evaluator(evaluation_results: list[EvaluationResult]) -> bool:
-        assert len(evaluation_results) == 1
-        assert evaluation_results[0][0].key == "foo"
-        assert evaluation_results[0][0].score == 1
-        return True
-
-    results = langchain_client.evaluate(
-        target,
-        data=dataset_name,
-        evaluators=[evaluator],
-        summary_evaluators=[summary_evaluator],
-        num_repetitions=1,
-    )
-    assert len(results._summary_results["results"]) == 1
-    assert results._summary_results["results"][0].score == 1
-    assert results._summary_results["results"][0].key == "summary_evaluator"
-
-    results = await langchain_client.aevaluate(
-        target_async,
-        data=dataset_name,
-        evaluators=[evaluator],
-        summary_evaluators=[summary_evaluator],
-        num_repetitions=1,
-    )
-    assert len(results._summary_results["results"]) == 1
-    assert results._summary_results["results"][0].score == 1
-    assert results._summary_results["results"][0].key == "summary_evaluator"
-
-    langchain_client.delete_dataset(dataset_id=dataset.id)
-
-
 def test_evaluate_with_attachments_multiple_evaluators(
     langchain_client: Client,
 ) -> None:
diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py
index e33d07fd5..9ad151bfc 100644
--- a/python/tests/unit_tests/evaluation/test_runner.py
+++ b/python/tests/unit_tests/evaluation/test_runner.py
@@ -265,15 +265,6 @@ def eval_list(run, example):
             {"score": 1, "key": "list_eval_int"},
         ]
 
-    def summary_eval_runs_examples(runs_, examples_):
-        return {"score": len(runs_[0].dotted_order)}
-
-    def summary_eval_inputs_outputs(inputs, outputs):
-        return [{"score": len([x["in"] for x in inputs])}]
-
-    def summary_eval_outputs_reference(outputs, reference_outputs):
-        return len([x["answer"] for x in reference_outputs])
-
     evaluators = [
         score_value_first,
         score_unpacked_inputs_outputs,
@@ -285,10 +276,23 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
         eval_list,
     ]
 
+    def summary_eval_runs_examples(runs_, examples_):
+        return {"score": len(runs_[0].dotted_order)}
+
+    def summary_eval_inputs_outputs(inputs, outputs):
+        return [{"score": len([x["in"] for x in inputs])}]
+
+    def summary_eval_outputs_reference(outputs, reference_outputs):
+        return len([x["answer"] for x in reference_outputs])
+
+    def summary_eval_evaluation_results(evaluation_results):
+        return all(len(r) == len(evaluators) + 1 for r in evaluation_results)
+
     summary_evaluators = [
         summary_eval_runs_examples,
         summary_eval_inputs_outputs,
         summary_eval_outputs_reference,
+        summary_eval_evaluation_results,
     ]
 
     results = evaluate(
@@ -302,6 +306,7 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
         upload_results=upload_results,
         max_concurrency=None,
     )
+
     if not blocking:
         deltas = []
         last = None
@@ -557,15 +562,6 @@ async def eval_list(run, example):
             {"score": 1, "key": "list_eval_int"},
         ]
 
-    def summary_eval_runs_examples(runs_, examples_):
-        return {"score": len(runs_[0].dotted_order)}
-
-    def summary_eval_inputs_outputs(inputs, outputs):
-        return {"score": len([x["in"] for x in inputs])}
-
-    def summary_eval_outputs_reference(outputs, reference_outputs):
-        return {"score": len([x["answer"] for x in reference_outputs])}
-
     evaluators = [
         score_value_first,
         score_unpacked_inputs_outputs,
@@ -577,10 +573,23 @@ def summary_eval_outputs_reference(outputs, reference_outputs):
         eval_list,
     ]
 
+    def summary_eval_runs_examples(runs_, examples_):
+        return {"score": len(runs_[0].dotted_order)}
+
+    def summary_eval_inputs_outputs(inputs, outputs):
+        return {"score": len([x["in"] for x in inputs])}
+
+    def summary_eval_outputs_reference(outputs, reference_outputs):
+        return {"score": len([x["answer"] for x in reference_outputs])}
+
+    def summary_eval_evaluation_results(evaluation_results):
+        return all(len(r) == len(evaluators) + 1 for r in evaluation_results)
+
     summary_evaluators = [
         summary_eval_runs_examples,
         summary_eval_inputs_outputs,
         summary_eval_outputs_reference,
+        summary_eval_evaluation_results,
     ]
 
     results = await aevaluate(