
add path as an option for uploading attachments #1331

Draft: wants to merge 19 commits into main
36 changes: 26 additions & 10 deletions python/langsmith/_internal/_operations.py
@@ -1,7 +1,8 @@
from __future__ import annotations

[CI annotation — GitHub Actions / benchmark] Benchmark results:

create_5_000_run_trees:                        Mean +- std dev: 701 ms +- 91 ms    (unstable: std dev is 13% of mean)
create_10_000_run_trees:                       Mean +- std dev: 1.45 sec +- 0.15 sec (unstable: std dev is 10% of mean)
create_20_000_run_trees:                       Mean +- std dev: 1.45 sec +- 0.21 sec (unstable: std dev is 14% of mean)
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 692 us +- 14 us
dumps_class_nested_py_leaf_50x100:             Mean +- std dev: 25.1 ms +- 0.3 ms
dumps_class_nested_py_leaf_100x200:            Mean +- std dev: 104 ms +- 3 ms
dumps_dataclass_nested_50x100:                 Mean +- std dev: 25.4 ms +- 0.5 ms
dumps_pydantic_nested_50x100:                  Mean +- std dev: 72.2 ms +- 16.9 ms  (unstable: std dev is 23% of mean)
dumps_pydanticv1_nested_50x100:                Mean +- std dev: 200 ms +- 2 ms

(For the unstable results, pyperf suggests rerunning with more runs/values/loops, running `python -m pyperf system tune` to reduce system jitter, and using `pyperf stats`, `pyperf dump`, and `pyperf hist` to analyze.)

[CI annotation — GitHub Actions / benchmark] Comparison against main:

+-----------------------------------------------+----------+------------------------+
| Benchmark                                     | main     | changes                |
+===============================================+==========+========================+
| dumps_pydanticv1_nested_50x100                | 220 ms   | 200 ms: 1.10x faster   |
| create_5_000_run_trees                        | 731 ms   | 701 ms: 1.04x faster   |
| dumps_dataclass_nested_50x100                 | 25.5 ms  | 25.4 ms: 1.00x faster  |
| dumps_class_nested_py_leaf_50x100             | 25.1 ms  | 25.1 ms: 1.00x slower  |
| dumps_class_nested_py_branch_and_leaf_200x400 | 690 us   | 692 us: 1.00x slower   |
| dumps_class_nested_py_leaf_100x200            | 104 ms   | 104 ms: 1.00x slower   |
| create_10_000_run_trees                       | 1.41 sec | 1.45 sec: 1.02x slower |
| create_20_000_run_trees                       | 1.39 sec | 1.45 sec: 1.04x slower |
| dumps_pydantic_nested_50x100                  | 65.9 ms  | 72.2 ms: 1.09x slower  |
+-----------------------------------------------+----------+------------------------+
| Geometric mean                                | (ref)    | 1.00x slower           |
+-----------------------------------------------+----------+------------------------+

import itertools
import logging
import os
import uuid
from typing import Literal, Optional, Union, cast

@@ -246,7 +247,7 @@
),
)
if op.attachments:
            for n, (content_type, data) in op.attachments.items():
if "." in n:
logger.warning(
f"Skipping logging of attachment '{n}' "
@@ -256,17 +257,32 @@
)
continue

                if isinstance(data, bytes):
                    acc_parts.append(
                        (
                            f"attachment.{op.id}.{n}",
                            (
                                None,
                                data,
                                content_type,
                                {"Content-Length": str(len(data))},
                            ),
                        )
                    )
                else:
                    file_path = data
                    file_size = os.path.getsize(file_path)
                    acc_parts.append(
                        (
                            f"attachment.{op.id}.{n}",
                            (
                                None,
                                open(file_path, "rb"),  # type: ignore[arg-type]
[Review thread on the open(file_path, "rb") line]

Contributor: do we not need to read this?

Contributor Author: I don't think so; I believe the multipart endpoint allows a buffer to be passed (I also added a test that should cover this).

Member: This is pretty CVE-worthy, so we should be careful about cases where this is allowed/encouraged.

Member: My suggestion would be to require either an environment variable of LANGSMITH_DANGEROUSLY_ALLOW_FILESYSTEM or some client init variable saying this. Otherwise throw an error.

Contributor: We need to explicitly close the files that are opened here, using some try/finally pattern. I would structure the code such that the multipart form is assembled and sent within the same method where we open the file handles:

try:
    # Prepare the files for the multipart request
    # Make the POST request
finally:
    # Explicitly close the files

f"{content_type}; length={file_size}",
{},
),
)
)
)
return MultipartPartsAndContext(
acc_parts,
f"trace={op.trace_id},id={op.id}",
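A minimal sketch of the try/finally pattern suggested in the review thread above, assuming a post_fn callable that performs the multipart POST; the helper name and structure are illustrative, not the PR's actual implementation:

import os
from pathlib import Path


def send_parts_with_files(attachments, post_fn):
    """Assemble multipart parts, POST them, and always close any opened file handles."""
    opened = []
    try:
        parts = []
        for name, (content_type, data) in attachments.items():
            if isinstance(data, (str, Path)):
                # Filesystem path: open here, close in the finally block below.
                f = open(data, "rb")
                opened.append(f)
                size = os.path.getsize(data)
                parts.append((name, (None, f, f"{content_type}; length={size}", {})))
            else:
                # Raw bytes: nothing to close.
                parts.append((name, (None, data, content_type, {"Content-Length": str(len(data))})))
        return post_fn(parts)  # the POST happens while the handles are still open
    finally:
        for f in opened:
            f.close()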
128 changes: 91 additions & 37 deletions python/langsmith/client.py
@@ -1537,6 +1537,7 @@ def multipart_ingest(
] = None,
*,
pre_sampled: bool = False,
dangerously_allow_filesystem: bool = False,
) -> None:
"""Batch ingest/upsert multiple runs in the Langsmith system.

@@ -1621,6 +1622,18 @@
)
)

        for op in serialized_ops:
            if isinstance(op, SerializedRunOperation) and op.attachments:
                for attachment in op.attachments.values():
                    if (
                        isinstance(attachment, tuple)
                        and isinstance(attachment[1], Path)
                        and not dangerously_allow_filesystem
                    ):
                        raise ValueError(
                            "Must set dangerously_allow_filesystem to True to use filesystem paths in multipart ingest."
                        )

        # send the runs in multipart requests
self._multipart_ingest_ops(serialized_ops)
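For illustration, the new opt-in guard distilled into a standalone sketch; the helper name here is hypothetical, but the check mirrors the loop above:

from pathlib import Path

def check_filesystem_opt_in(attachments: dict, dangerously_allow_filesystem: bool) -> None:
    """Reject (mime_type, Path) attachments unless the caller explicitly opted in."""
    for attachment in attachments.values():
        if (
            isinstance(attachment, tuple)
            and isinstance(attachment[1], Path)
            and not dangerously_allow_filesystem
        ):
            raise ValueError("Must set dangerously_allow_filesystem to True to use filesystem paths.")

check_filesystem_opt_in({"img": ("image/png", b"...")}, False)           # bytes always pass
check_filesystem_opt_in({"img": ("image/png", Path("img.png"))}, True)   # Path passes with opt-in
# check_filesystem_opt_in({"img": ("image/png", Path("img.png"))}, False)  # would raise ValueError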

@@ -1684,6 +1697,7 @@ def update_run(
extra: Optional[Dict] = None,
tags: Optional[List[str]] = None,
attachments: Optional[ls_schemas.Attachments] = None,
dangerously_allow_filesystem: bool = False,
**kwargs: Any,
) -> None:
"""Update a run in the LangSmith API.
@@ -1726,6 +1740,15 @@
"session_name": kwargs.pop("session_name", None),
}
        if attachments:
            for _, attachment in attachments.items():
                if (
                    isinstance(attachment, tuple)
                    and isinstance(attachment[1], Path)
                    and not dangerously_allow_filesystem
                ):
                    raise ValueError(
                        "Must set dangerously_allow_filesystem=True to allow filesystem attachments."
                    )
            data["attachments"] = attachments
use_multipart = (
self.tracing_queue is not None
@@ -1835,7 +1858,12 @@ def read_run(
        response = self.request_with_retries(
            "GET", f"/runs/{_as_uuid(run_id, 'run_id')}"
        )
        attachments = _convert_stored_attachments_to_attachments_dict(
            response.json(), "s3_urls"
        )
        run = ls_schemas.Run(
            attachments=attachments, **response.json(), _host_url=self._host_url
        )
        if load_child_runs and run.child_run_ids:
            run = self._load_child_runs(run)
        return run
@@ -2031,7 +2059,13 @@ def list_runs(
        for i, run in enumerate(
            self._get_cursor_paginated_list("/runs/query", body=body_query)
        ):
            # Should this be behind a flag?
            attachments = _convert_stored_attachments_to_attachments_dict(
                run, "s3_urls"
            )
            yield ls_schemas.Run(
                attachments=attachments, **run, _host_url=self._host_url
            )
            if limit is not None and i + 1 >= limit:
                break
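With these changes, runs fetched from the API surface their stored attachments as pre-downloaded readers. A hedged usage sketch, where run_id stands in for the UUID of an existing run:

from langsmith import Client

client = Client()
run = client.read_run(run_id)  # run_id: hypothetical UUID of an existing run
for name, info in run.attachments.items():
    data = info["reader"].read()  # a BytesIO filled from the presigned S3 URL
    print(f"{name}: {len(data)} bytes via {info['presigned_url']}")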

@@ -3470,6 +3504,7 @@ def _prepare_multipart_data(
| List[ls_schemas.ExampleUpdateWithAttachments],
],
include_dataset_id: bool = False,
dangerously_allow_filesystem: bool = False,
) -> Tuple[Any, bytes]:
parts: List[MultipartPart] = []
if include_dataset_id:
@@ -3555,19 +3590,24 @@
            for name, attachment in example.attachments.items():
                if isinstance(attachment, tuple):
                    if isinstance(attachment[1], Path):
                        if dangerously_allow_filesystem:
                            mime_type, file_path = attachment
                            file_size = os.path.getsize(file_path)
                            parts.append(
                                (
                                    f"{example_id}.attachment.{name}",
                                    (
                                        None,
                                        open(file_path, "rb"),  # type: ignore[arg-type]
                                        f"{mime_type}; length={file_size}",
                                        {},
                                    ),
                                )
                            )
                        else:
                            raise ValueError(
                                "dangerously_allow_filesystem must be True to upload files from the filesystem"
                            )
                    else:
                        mime_type, data = attachment
                        parts.append(
@@ -3624,6 +3664,7 @@ def update_examples_multipart(
*,
dataset_id: ID_TYPE,
updates: Optional[List[ls_schemas.ExampleUpdateWithAttachments]] = None,
dangerously_allow_filesystem: bool = False,
) -> ls_schemas.UpsertExamplesResponse:
"""Upload examples."""
if not (self.info.instance_flags or {}).get(
@@ -3635,7 +3676,11 @@
if updates is None:
updates = []

        encoder, data = self._prepare_multipart_data(
            updates,
            include_dataset_id=False,
            dangerously_allow_filesystem=dangerously_allow_filesystem,
        )

response = self.request_with_retries(
"PATCH",
Expand All @@ -3656,6 +3701,7 @@ def upload_examples_multipart(
*,
dataset_id: ID_TYPE,
uploads: Optional[List[ls_schemas.ExampleUploadWithAttachments]] = None,
dangerously_allow_filesystem: bool = False,
) -> ls_schemas.UpsertExamplesResponse:
"""Upload examples."""
if not (self.info.instance_flags or {}).get(
@@ -3666,7 +3712,11 @@
)
if uploads is None:
uploads = []
        encoder, data = self._prepare_multipart_data(
            uploads,
            include_dataset_id=False,
            dangerously_allow_filesystem=dangerously_allow_filesystem,
        )

response = self.request_with_retries(
"POST",
Expand All @@ -3686,6 +3736,7 @@ def upsert_examples_multipart(
self,
*,
upserts: Optional[List[ls_schemas.ExampleUpsertWithAttachments]] = None,
dangerously_allow_filesystem: bool = False,
) -> ls_schemas.UpsertExamplesResponse:
"""Upsert examples.

@@ -3702,7 +3753,11 @@
if upserts is None:
upserts = []

        encoder, data = self._prepare_multipart_data(
            upserts,
            include_dataset_id=True,
            dangerously_allow_filesystem=dangerously_allow_filesystem,
        )

response = self.request_with_retries(
"POST",
@@ -3894,16 +3949,9 @@ def read_example(
)

        example = response.json()
        attachments = _convert_stored_attachments_to_attachments_dict(
            example, "attachment_urls"
        )

        return ls_schemas.Example(
            **{k: v for k, v in example.items() if k != "attachment_urls"},
@@ -3980,17 +4028,9 @@ def list_examples(
        for i, example in enumerate(
            self._get_paginated_list("/examples", params=params)
        ):
            attachments = _convert_stored_attachments_to_attachments_dict(
                example, "attachment_urls"
            )
            yield ls_schemas.Example(
                **{k: v for k, v in example.items() if k != "attachment_urls"},
                attachments=attachments,
@@ -6657,3 +6697,17 @@ def convert_prompt_to_anthropic_format(
return anthropic._get_request_payload(messages, stop=stop)
except Exception as e:
raise ls_utils.LangSmithError(f"Error converting to Anthropic format: {e}")


def _convert_stored_attachments_to_attachments_dict(data, attachments_key):
    attachments_dict = {}
    if attachments_key in data and data[attachments_key]:
        for key, value in data[attachments_key].items():
            response = requests.get(value["presigned_url"], stream=True)
            response.raise_for_status()
            reader = io.BytesIO(response.content)
            attachments_dict[key.removeprefix("attachment.")] = {
                "presigned_url": value["presigned_url"],
                "reader": reader,
            }
    return attachments_dict
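For reference, a sketch of the shape this helper produces, assuming the stored payload maps attachment.<name> keys to presigned URLs as the code above implies:

import io

stored = {"s3_urls": {"attachment.report": {"presigned_url": "https://example.com/r"}}}
# _convert_stored_attachments_to_attachments_dict(stored, "s3_urls") downloads each
# presigned URL and returns roughly:
expected = {
    "report": {
        "presigned_url": "https://example.com/r",
        "reader": io.BytesIO(b"<downloaded bytes>"),  # placeholder for the fetched content
    }
}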
4 changes: 3 additions & 1 deletion python/langsmith/run_trees.py
@@ -318,7 +318,9 @@ def patch(self) -> None:
"""Patch the run tree to the API in a background thread."""
if not self.end_time:
self.end()
attachments = self.attachments
attachments = {
a: v for a, v in self.attachments.items() if isinstance(v, tuple)
}
[Review thread on lines +321 to +323]

Contributor Author: This was done strictly for typing.

Contributor: is v ever not a tuple?

Member: If so, it might be worth doing in a for loop and raising an error if not; I think that would be unexpected. But mypy probably knows better than me, so there might be a case we're not handling here.

Contributor Author: v is never not a tuple here; this is just because of the union types, I believe. Mypy thinks something could be passed here that never is.

        try:
            # Avoid loading the same attachment twice
            if attachments:
17 changes: 11 additions & 6 deletions python/langsmith/schemas.py
@@ -39,6 +39,8 @@
StrictInt,
)

from pathlib import Path

from typing_extensions import Literal

SCORE_TYPE = Union[StrictBool, StrictInt, StrictFloat, None]
@@ -63,7 +65,7 @@ def my_function(bar: int, my_val: Attachment):
data: bytes


Attachments = Dict[str, Union[Tuple[str, bytes], Attachment, Tuple[str, Path]]]
"""Attachments associated with the run.
Each entry is a tuple of (mime_type, bytes), or (mime_type, file_path)"""

@@ -76,10 +78,6 @@ def read(self, size: int = -1) -> bytes:
"""Read function."""
...

def write(self, b: bytes) -> int:
"""Write function."""
...

def seek(self, offset: int, whence: int = 0) -> int:
"""Seek function."""
...
@@ -370,7 +368,9 @@ class RunBase(BaseModel):
    tags: Optional[List[str]] = None
    """Tags for categorizing or annotating the run."""

    attachments: Union[Attachments, Dict[str, AttachmentInfo]] = Field(
        default_factory=dict
    )
[Review thread on the attachments field]

Contributor: is this related to this PR?

Contributor Author: yeah, we need this for the changes in read_run.
"""Attachments associated with the run.
Each entry is a tuple of (mime_type, bytes)."""

@@ -390,6 +390,11 @@ def __repr__(self):
"""Return a string representation of the RunBase object."""
return f"{self.__class__}(id={self.id}, name='{self.name}', run_type='{self.run_type}')"

    class Config:
        """Configuration class for the schema."""

        arbitrary_types_allowed = True
[Review thread on lines +393 to +397]

Contributor Author: Needed to allow the AttachmentInfo.

class Run(RunBase):
"""Run schema when loading from the DB."""