diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py
index 25ea0d62a..39d324ba7 100644
--- a/python/langsmith/evaluation/_arunner.py
+++ b/python/langsmith/evaluation/_arunner.py
@@ -844,6 +844,10 @@ async def _arun_evaluators(
                         f" run {run.id}: {repr(e)}",
                         exc_info=True,
                     )
+                if example.attachments is not None:
+                    # Rewind so the next evaluator reads each attachment from byte 0.
+                    for attachment in example.attachments.values():
+                        attachment["reader"].seek(0)
             return ExperimentResultRow(
                 run=run,
                 example=example,
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index a2425c2af..3bcd9d04c 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -1508,7 +1508,20 @@ async def target(
         assert image_data.read() == b"fake image data for testing"
         return {"answer": "test image"}
 
-    async def evaluator(
+    async def evaluator_1(
+        outputs: dict, reference_outputs: dict, attachments: dict
+    ) -> Dict[str, Any]:
+        assert "image" in attachments
+        assert "presigned_url" in attachments["image"]
+        image_data = attachments["image"]["reader"]
+        assert image_data.read() == b"fake image data for testing"
+        return {
+            "score": float(
+                reference_outputs.get("answer") == outputs.get("answer")  # type: ignore
+            )
+        }
+
+    async def evaluator_2(
         outputs: dict, reference_outputs: dict, attachments: dict
     ) -> Dict[str, Any]:
         assert "image" in attachments
@@ -1522,12 +1535,17 @@ async def evaluator(
         }
 
     results = await langchain_client.aevaluate(
-        target, data=dataset_name, evaluators=[evaluator], num_repetitions=2
+        target,
+        data=dataset_name,
+        evaluators=[evaluator_1, evaluator_2],
+        num_repetitions=2,
+        max_concurrency=3,
     )
 
     assert len(results) == 2
     async for result in results:
         assert result["evaluation_results"]["results"][0].score == 1.0
+        assert result["evaluation_results"]["results"][1].score == 1.0
 
     langchain_client.delete_dataset(dataset_name=dataset_name)
 