
add attachments to evaluate #1237

Merged on Dec 10, 2024 (21 commits).
Commits (the diff below shows changes from 3 of the 21 commits):
9bd5969  wip (isahers1, Nov 20, 2024)
a72a268  rip keys (isahers1, Nov 20, 2024)
16e5e69  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 2, 2024)
0b6e2c4  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 6, 2024)
7307836  changes (isahers1, Dec 6, 2024)
cf53bbe  fmt (isahers1, Dec 7, 2024)
2a87196  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
0cb2118  refactor (isahers1, Dec 9, 2024)
3c92c38  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
a342f86  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
9289225  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
8986216  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
799f69c  fmt (isahers1, Dec 9, 2024)
ac16178  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
bc36039  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
cedd8af  attachment_urls -> attachments (isahers1, Dec 9, 2024)
8cc8ce3  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
14130fa  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 9, 2024)
b99cdc4  fmt (isahers1, Dec 10, 2024)
4d27f41  Merge branch 'isaac/multipartstuff' into isaac/addattachmentsevaluator (isahers1, Dec 10, 2024)
24d0159  fmt (isahers1, Dec 10, 2024)
14 changes: 12 additions & 2 deletions python/langsmith/client.py
@@ -1,4 +1,4 @@
"""Client for interacting with the LangSmith API.

GitHub Actions / benchmark (notices on line 1 of python/langsmith/client.py):

Benchmark results: create_5_000_run_trees 718 ms ± 87 ms; create_10_000_run_trees 1.44 s ± 0.21 s; create_20_000_run_trees 1.39 s ± 0.18 s; dumps_class_nested_py_branch_and_leaf_200x400 718 µs ± 60 µs; dumps_class_nested_py_leaf_50x100 24.8 ms ± 0.3 ms; dumps_class_nested_py_leaf_100x200 102 ms ± 0 ms; dumps_dataclass_nested_50x100 25.3 ms ± 0.4 ms; dumps_pydantic_nested_50x100 72.9 ms ± 17.3 ms; dumps_pydanticv1_nested_50x100 201 ms ± 5 ms. (pyperf flagged several results as potentially unstable, with standard deviations of 12% to 24% of the mean, and suggests more runs/values/loops or 'python -m pyperf system tune' to reduce system jitter.)

Comparison against main (geometric mean: 1.01x slower):

Benchmark                                     | main     | changes
dumps_pydanticv1_nested_50x100                | 218 ms   | 201 ms (1.09x faster)
dumps_class_nested_py_leaf_100x200            | 103 ms   | 102 ms (1.02x faster)
dumps_class_nested_py_leaf_50x100             | 24.9 ms  | 24.8 ms (1.00x faster)
dumps_dataclass_nested_50x100                 | 25.3 ms  | 25.3 ms (1.00x slower)
create_5_000_run_trees                        | 707 ms   | 718 ms (1.02x slower)
create_20_000_run_trees                       | 1.37 sec | 1.39 sec (1.02x slower)
dumps_class_nested_py_branch_and_leaf_200x400 | 700 µs   | 718 µs (1.03x slower)
create_10_000_run_trees                       | 1.39 sec | 1.44 sec (1.04x slower)
dumps_pydantic_nested_50x100                  | 65.8 ms  | 72.9 ms (1.11x slower)

Use the client to customize API keys / workspace connections, SSL certs,
etc. for tracing.
@@ -140,6 +140,16 @@
URLLIB3_SUPPORTS_BLOCKSIZE = "key_blocksize" in signature(PoolKey).parameters


class AutoSeekBytesIO(io.BytesIO):
"""BytesIO class that resets on read."""

def read(self, *args, **kwargs):
"""Reset on read."""
data = super().read(*args, **kwargs)
self.seek(0)
return data


def _parse_token_or_url(
url_or_token: Union[str, uuid.UUID],
api_url: str,
@@ -3808,7 +3818,7 @@
for key, value in example["attachment_urls"].items():
response = requests.get(value["presigned_url"], stream=True)
response.raise_for_status()
reader = io.BytesIO(response.content)
reader = AutoSeekBytesIO(response.content)
attachment_urls[key.split(".")[1]] = (
value["presigned_url"],
reader,
@@ -3895,7 +3905,7 @@
for key, value in example["attachment_urls"].items():
response = requests.get(value["presigned_url"], stream=True)
response.raise_for_status()
reader = io.BytesIO(response.content)
reader = AutoSeekBytesIO(response.content)
attachment_urls[key.split(".")[1]] = (
value["presigned_url"],
reader,
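For context, here is a minimal standalone sketch of what AutoSeekBytesIO buys over a plain io.BytesIO: because read() seeks back to position 0, the same attachment reader can be consumed by the target function and then again by each evaluator without anyone rewinding it manually. The class below is a simplified copy for illustration, not an import from langsmith, and the sample bytes are arbitrary.

import io


class AutoSeekBytesIO(io.BytesIO):
    """BytesIO that rewinds to the start after every read."""

    def read(self, *args, **kwargs):
        data = super().read(*args, **kwargs)
        self.seek(0)  # rewind so the next consumer sees the full payload again
        return data


# A plain BytesIO is exhausted after the first read...
plain = io.BytesIO(b"fake image data for testing")
assert plain.read() == b"fake image data for testing"
assert plain.read() == b""  # nothing left for a second consumer

# ...while AutoSeekBytesIO can be read repeatedly, e.g. by the target and then by an evaluator.
auto = AutoSeekBytesIO(b"fake image data for testing")
assert auto.read() == b"fake image data for testing"
assert auto.read() == b"fake image data for testing"
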
11 changes: 10 additions & 1 deletion python/langsmith/evaluation/evaluator.py
@@ -624,7 +624,14 @@ def _normalize_evaluator_func(
Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
supported_args = ("run", "example", "inputs", "outputs", "reference_outputs")
supported_args = (
"run",
"example",
"inputs",
"outputs",
"reference_outputs",
"attachments",
)
sig = inspect.signature(func)
positional_args = [
pname
@@ -659,6 +666,7 @@ async def awrapper(
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachment_urls or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
@@ -679,6 +687,7 @@ def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
"example": example,
"inputs": example.inputs if example else {},
"outputs": run.outputs or {},
"attachments": example.attachment_urls or {},
"reference_outputs": example.outputs or {} if example else {},
}
args = (arg_map[arg] for arg in positional_args)
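With "attachments" added to supported_args and to arg_map, an evaluator can be declared directly against the named arguments and will receive the example's attachments alongside inputs and outputs. A hedged sketch of the resulting call shape follows; the evaluator body and the "image" attachment name are illustrative, not part of this PR.

from typing import Any, Dict


def attachment_aware_evaluator(
    outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
    # attachments maps name -> (presigned_url, reader), matching what
    # client.py builds from example["attachment_urls"] above.
    _url, reader = attachments["image"]  # "image" is an illustrative key
    image_bytes = reader.read()          # the reader rewinds itself after read()
    return {
        "score": float(outputs.get("answer") == reference_outputs.get("answer")),
        "comment": f"saw {len(image_bytes)} attachment bytes",
    }


# _normalize_evaluator_func inspects the parameter names above and fills them
# from arg_map, so the wrapper effectively calls:
#   attachment_aware_evaluator(
#       outputs=run.outputs or {},
#       reference_outputs=example.outputs or {},
#       attachments=example.attachment_urls or {},
#   )
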
129 changes: 109 additions & 20 deletions python/tests/integration_tests/test_client.py
@@ -20,7 +20,7 @@
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from langsmith.client import ID_TYPE, Client
from langsmith.evaluation import evaluate
from langsmith.evaluation import aevaluate, evaluate
from langsmith.schemas import (
DataType,
Example,
@@ -1233,9 +1233,6 @@ def create_encoder(*args, **kwargs):
assert not caplog.records


@pytest.mark.skip(
reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first"
)
def test_list_examples_attachments_keys(langchain_client: Client) -> None:
"""Test list_examples returns same keys with and without attachments."""
dataset_name = "__test_list_examples_attachments" + uuid4().hex[:4]
@@ -1271,24 +1268,16 @@ def test_list_examples_attachments_keys(langchain_client: Client) -> None:
langchain_client.delete_dataset(dataset_id=dataset.id)


@pytest.mark.skip(
reason="Need to land https://github.com/langchain-ai/langsmith-sdk/pull/1209 first"
)
def test_evaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
langchain_client = Client(
api_key="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",
api_url="https://dev.api.smith.langchain.com",
)
# 1. Create dataset

dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

# 2. Create example with attachments
example = ExampleUpsertWithAttachments(
dataset_id=dataset.id,
inputs={"question": "What is shown in the image?"},
@@ -1300,23 +1289,25 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None:

langchain_client.upsert_examples_multipart(upserts=[example])

# 3. Define target function that uses attachments
def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
image_url, image_data = attachments["image"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

# 4. Define simple evaluator
def evaluator(run: Run, example: Example) -> Dict[str, Any]:
def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
image_url, image_data = attachments["image"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

# 5. Run evaluation
results = evaluate(
target,
data=dataset_name,
@@ -1325,12 +1316,10 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]:
num_repetitions=2,
)

# 6. Verify results
assert len(results) == 2
for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

# Cleanup
langchain_client.delete_dataset(dataset_name=dataset_name)


@@ -1381,6 +1370,106 @@ def evaluator(run: Run, example: Example) -> Dict[str, Any]:

langchain_client.delete_dataset(dataset_name=dataset_name)

async def test_aevaluate_with_attachments(langchain_client: Client) -> None:
"""Test evaluating examples with attachments."""
dataset_name = "__test_aevaluate_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals with attachments",
data_type=DataType.kv,
)

example = ExampleUpsertWithAttachments(
dataset_id=dataset.id,
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upsert_examples_multipart(upserts=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
image_url, image_data = attachments["image"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

async def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
image_url, image_data = attachments["image"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = await aevaluate(
target, data=dataset_name, evaluators=[evaluator], client=langchain_client
)

assert len(results) == 1
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)


async def test_aevaluate_with_no_attachments(langchain_client: Client) -> None:
"""Test evaluating examples without attachments using a target with attachments."""
dataset_name = "__test_aevaluate_no_attachments" + uuid4().hex[:4]
dataset = langchain_client.create_dataset(
dataset_name,
description="Test dataset for evals without attachments",
data_type=DataType.kv,
)

# Create example using old way, attachments should be set to {}
langchain_client.create_example(
dataset_id=dataset.id,
inputs={"question": "What is 2+2?"},
outputs={"answer": "4"},
)

# Verify we can create example the new way without attachments
example = ExampleUpsertWithAttachments(
dataset_id=dataset.id,
inputs={"question": "What is 3+1?"},
outputs={"answer": "4"},
)
langchain_client.upsert_examples_multipart(upserts=[example])

async def target(
inputs: Dict[str, Any], attachments: Dict[str, Any]
) -> Dict[str, Any]:
# Verify we receive an empty attachments dict
assert isinstance(attachments, dict)
assert len(attachments) == 0
return {"answer": "4"}

async def evaluator(run: Run, example: Example) -> Dict[str, Any]:
return {
"score": float(
run.outputs.get("answer") == example.outputs.get("answer") # type: ignore
)
}

results = await aevaluate(
target, data=dataset_name, evaluators=[evaluator], client=langchain_client
)

assert len(results) == 2
async for result in results:
assert result["evaluation_results"]["results"][0].score == 1.0

langchain_client.delete_dataset(dataset_name=dataset_name)

def test_examples_length_validation(langchain_client: Client) -> None:
"""Test that mismatched lengths raise ValueError for create and update examples."""
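Taken together, the tests above exercise the full flow this PR enables. A condensed, hedged sketch of the end-to-end usage follows; the dataset name and attachment bytes are illustrative, and it assumes a configured LangSmith client plus the ExampleUpsertWithAttachments schema used by the tests.

from typing import Any, Dict

from langsmith import Client
from langsmith.evaluation import evaluate
from langsmith.schemas import DataType, ExampleUpsertWithAttachments

client = Client()
dataset_name = "attachment-eval-demo"  # illustrative name

dataset = client.create_dataset(
    dataset_name,
    description="Demo dataset for attachment-aware evals",
    data_type=DataType.kv,
)
client.upsert_examples_multipart(
    upserts=[
        ExampleUpsertWithAttachments(
            dataset_id=dataset.id,
            inputs={"question": "What is shown in the image?"},
            outputs={"answer": "test image"},
            attachments={"image": ("image/png", b"fake image data for testing")},
        )
    ]
)


def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
    _url, reader = attachments["image"]  # (presigned_url, reader) tuple
    _ = reader.read()                    # the reader rewinds itself after read()
    return {"answer": "test image"}


def evaluator(
    outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
    return {"score": float(outputs.get("answer") == reference_outputs.get("answer"))}


results = evaluate(target, data=dataset_name, evaluators=[evaluator], client=client)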