From a5ee5990f47903a81d252f996d923758c02252f4 Mon Sep 17 00:00:00 2001
From: isaac hershenson <ihershenson@hmc.edu>
Date: Mon, 9 Dec 2024 08:46:18 -0800
Subject: [PATCH] refactor

---
 python/langsmith/client.py                    | 29 ++++++------
 python/langsmith/evaluation/_arunner.py       |  6 ++-
 python/langsmith/evaluation/_runner.py        |  8 ++--
 python/langsmith/schemas.py                   | 30 +++++++++----
 python/tests/integration_tests/test_client.py | 44 +++++--------------
 5 files changed, 56 insertions(+), 61 deletions(-)

diff --git a/python/langsmith/client.py b/python/langsmith/client.py
index 9fe393125..778911808 100644
--- a/python/langsmith/client.py
+++ b/python/langsmith/client.py
@@ -3580,6 +3580,7 @@ def _prepate_multipart_data(
     def upload_examples_multipart(
         self,
         *,
+        dataset_id: ID_TYPE,
         uploads: List[ls_schemas.ExampleUploadWithAttachments] = None,
     ) -> ls_schemas.UpsertExamplesResponse:
         """Upload examples."""
@@ -3592,10 +3593,6 @@ def upload_examples_multipart(
         if uploads is None:
             uploads = []
         encoder, data = self._prepate_multipart_data(uploads, include_dataset_id=False)
-        dataset_ids = set([example.dataset_id for example in uploads])
-        if len(dataset_ids) > 1:
-            raise ValueError("All examples must be in the same dataset.")
-        dataset_id = list(dataset_ids)[0]
 
         response = self.request_with_retries(
             "POST",
@@ -3823,21 +3820,21 @@ def read_example(
         )
 
         example = response.json()
-        attachment_urls = {}
+        attachments_info = {}
         if example["attachment_urls"]:
             for key, value in example["attachment_urls"].items():
                 response = requests.get(value["presigned_url"], stream=True)
                 response.raise_for_status()
                 reader = io.BytesIO(response.content)
-                attachment_urls[key.split(".")[1]] = (
-                    value["presigned_url"],
-                    reader,
-                )
+                attachments_info[key.split(".")[1]] = {
+                    "presigned_url": value["presigned_url"],
+                    "reader": reader,
+                }
         del example["attachment_urls"]
 
         return ls_schemas.Example(
             **example,
-            attachment_urls=attachment_urls,
+            attachments_info=attachments_info,
             _host_url=self._host_url,
             _tenant_id=self._get_optional_tenant_id(),
         )
@@ -3910,21 +3907,21 @@ def list_examples(
         for i, example in enumerate(
             self._get_paginated_list("/examples", params=params)
         ):
-            attachment_urls = {}
+            attachments_info = {}
             if example["attachment_urls"]:
                 for key, value in example["attachment_urls"].items():
                     response = requests.get(value["presigned_url"], stream=True)
                     response.raise_for_status()
                     reader = io.BytesIO(response.content)
-                    attachment_urls[key.split(".")[1]] = (
-                        value["presigned_url"],
-                        reader,
-                    )
+                    attachments_info[key.split(".")[1]] = {
+                        "presigned_url": value["presigned_url"],
+                        "reader": reader,
+                    }
             del example["attachment_urls"]
 
             yield ls_schemas.Example(
                 **example,
-                attachment_urls=attachment_urls,
+                attachments_info=attachments_info,
                 _host_url=self._host_url,
                 _tenant_id=self._get_optional_tenant_id(),
             )
diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py
index 9412bf5f3..5a22ba305 100644
--- a/python/langsmith/evaluation/_arunner.py
+++ b/python/langsmith/evaluation/_arunner.py
@@ -1023,7 +1023,7 @@ def _get_run(r: run_trees.RunTree) -> None:
     with rh.tracing_context(enabled=True):
         try:
             args = (
-                (example.inputs, example.attachment_urls)
+                (example.inputs, example.attachments_info)
                 if include_attachments
                 else (example.inputs,)
             )
@@ -1044,6 +1044,10 @@ def _get_run(r: run_trees.RunTree) -> None:
                     client=client,
                 ),
             )
+            if include_attachments and example.attachments_info is not None:
+                for attachment in example.attachments_info:
+                    reader = example.attachments_info[attachment]["reader"]
+                    reader.seek(0)
         except Exception as e:
             logger.error(
                 f"Error running target function: {e}", exc_info=True, stacklevel=1
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index ebd259e14..fbb096484 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -1834,7 +1834,7 @@ def _get_run(r: rt.RunTree) -> None:
         )
         try:
             args = (
-                (example.inputs, example.attachment_urls)
+                (example.inputs, example.attachments_info)
                 if include_attachments
                 else (example.inputs,)
             )
@@ -1842,9 +1842,9 @@ def _get_run(r: rt.RunTree) -> None:
                 *args,
                 langsmith_extra=langsmith_extra,
             )
-            if include_attachments and example.attachment_urls is not None:
-                for attachment in example.attachment_urls:
-                    _, reader = example.attachment_urls[attachment]
+            if include_attachments and example.attachments_info is not None:
+                for attachment in example.attachments_info:
+                    reader = example.attachments_info[attachment]["reader"]
                     reader.seek(0)
         except Exception as e:
             logger.error(
diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
index 34de09aaa..b06552dcd 100644
--- a/python/langsmith/schemas.py
+++ b/python/langsmith/schemas.py
@@ -93,9 +93,6 @@ class ExampleBase(BaseModel):
     inputs: Dict[str, Any] = Field(default_factory=dict)
     outputs: Optional[Dict[str, Any]] = Field(default=None)
     metadata: Optional[Dict[str, Any]] = Field(default=None)
-    attachment_urls: Optional[Dict[str, Tuple[str, BinaryIOLike]]] = Field(default=None)
-    """Dictionary with attachment names as keys and a tuple of the S3 url
-    and a reader of the data for the file."""
 
     class Config:
         """Configuration class for the schema."""
@@ -112,16 +109,30 @@ class ExampleCreate(ExampleBase):
     split: Optional[Union[str, List[str]]] = None
 
 
-class ExampleUpsertWithAttachments(ExampleCreate):
-    """Example create with attachments."""
+class ExampleUploadWithAttachments(BaseModel):
+    """Example upload with attachments."""
 
+    id: Optional[UUID]
+    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+    inputs: Dict[str, Any] = Field(default_factory=dict)
+    outputs: Optional[Dict[str, Any]] = Field(default=None)
+    metadata: Optional[Dict[str, Any]] = Field(default=None)
+    split: Optional[Union[str, List[str]]] = None
     attachments: Optional[Attachments] = None
 
 
-class ExampleUploadWithAttachments(ExampleUpsertWithAttachments):
-    """Example upload with attachments."""
+class ExampleUpsertWithAttachments(ExampleUploadWithAttachments):
+    """Example create with attachments."""
+
+    dataset_id: UUID
 
-    pass
+
+class AttachmentInfo(TypedDict):
+    """Info for an attachment."""
+
+    presigned_url: str
+    reader: BinaryIOLike
+    # TODO: add mime type
 
 
 class Example(ExampleBase):
@@ -135,6 +146,9 @@ class Example(ExampleBase):
     modified_at: Optional[datetime] = Field(default=None)
     runs: List[Run] = Field(default_factory=list)
     source_run_id: Optional[UUID] = None
+    attachments_info: Optional[Dict[str, AttachmentInfo]] = Field(default=None)
+    """Dictionary with attachment names as keys and a tuple of the S3 url
+    and a reader of the data for the file."""
     _host_url: Optional[str] = PrivateAttr(default=None)
     _tenant_id: Optional[UUID] = PrivateAttr(default=None)
 
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index bc3f6f33f..e175efc9e 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -393,7 +393,6 @@ def test_upload_examples_multipart(langchain_client: Client):
     example_id = uuid4()
     example_1 = ExampleUploadWithAttachments(
         id=example_id,
-        dataset_id=dataset.id,
         inputs={"text": "hello world"},
         attachments={
             "test_file": ("text/plain", b"test content"),
@@ -402,13 +401,11 @@ def test_upload_examples_multipart(langchain_client: Client):
 
     # Test example with minimum required fields
     example_2 = ExampleUploadWithAttachments(
-        dataset_id=dataset.id,
         inputs={"text": "minimal example"},
     )
 
     # Test example with outputs and multiple attachments
     example_3 = ExampleUploadWithAttachments(
-        dataset_id=dataset.id,
         inputs={"text": "example with outputs"},
         outputs={"response": "test response"},
         attachments={
@@ -419,7 +416,7 @@ def test_upload_examples_multipart(langchain_client: Client):
 
     # Test uploading multiple examples at once
     created_examples = langchain_client.upload_examples_multipart(
-        uploads=[example_1, example_2, example_3]
+        dataset_id=dataset.id, uploads=[example_1, example_2, example_3]
     )
     assert created_examples["count"] == 3
 
@@ -439,7 +436,7 @@ def test_upload_examples_multipart(langchain_client: Client):
     # Verify example with ID was created with correct ID
     example_with_id = [ex for ex in examples if ex.id == example_id][0]
     assert example_with_id.inputs["text"] == "hello world"
-    assert "test_file" in example_with_id.attachment_urls
+    assert "test_file" in example_with_id.attachments_info
 
     # Verify example with outputs and multiple attachments
     example_with_outputs = next(
@@ -447,35 +444,20 @@ def test_upload_examples_multipart(langchain_client: Client):
         for ex in examples
         if ex.outputs and ex.outputs.get("response") == "test response"
     )
-    assert len(example_with_outputs.attachment_urls) == 2
-    assert "file1" in example_with_outputs.attachment_urls
-    assert "file2" in example_with_outputs.attachment_urls
+    assert len(example_with_outputs.attachments_info) == 2
+    assert "file1" in example_with_outputs.attachments_info
+    assert "file2" in example_with_outputs.attachments_info
 
     # Test uploading to non-existent dataset fails
     fake_id = uuid4()
     with pytest.raises(LangSmithNotFoundError):
         langchain_client.upload_examples_multipart(
+            dataset_id=fake_id,
             uploads=[
                 ExampleUploadWithAttachments(
-                    dataset_id=fake_id,
                     inputs={"text": "should fail"},
                 )
-            ]
-        )
-
-    # Test uploading examples to different datasets fails
-    with pytest.raises(ValueError, match="All examples must be in the same dataset"):
-        langchain_client.upload_examples_multipart(
-            uploads=[
-                ExampleUploadWithAttachments(
-                    dataset_id=dataset.id,
-                    inputs={"text": "example 1"},
-                ),
-                ExampleUploadWithAttachments(
-                    dataset_id=uuid4(),
-                    inputs={"text": "example 2"},
-                ),
-            ]
+            ],
         )
 
     # Clean up
@@ -1283,8 +1265,7 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None:
     )
 
     # 2. Create example with attachments
-    example = ExampleUpsertWithAttachments(
-        dataset_id=dataset.id,
+    example = ExampleUploadWithAttachments(
         inputs={"question": "What is shown in the image?"},
         outputs={"answer": "test image"},
         attachments={
@@ -1292,13 +1273,13 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None:
         },
     )
 
-    langchain_client.upsert_examples_multipart(upserts=[example])
+    langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])
 
     # 3. Define target function that uses attachments
     def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
         # Verify we receive the attachment data
         assert "image" in attachments
-        image_url, image_data = attachments["image"]
+        image_data = attachments["image"]["reader"]
         assert image_data.read() == b"fake image data for testing"
         return {"answer": "test image"}
 
@@ -1345,12 +1326,11 @@ def test_evaluate_with_no_attachments(langchain_client: Client) -> None:
     )
 
     # Verify we can create example the new way without attachments
-    example = ExampleUpsertWithAttachments(
-        dataset_id=dataset.id,
+    example = ExampleUploadWithAttachments(
         inputs={"question": "What is 3+1?"},
         outputs={"answer": "4"},
     )
-    langchain_client.upsert_examples_multipart(upserts=[example])
+    langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])
 
     def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
         # Verify we receive an empty attachments dict