langchain-ai · agola11 · Dec 10, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/python/langsmith/client.py b/python/langsmith/client.py
@@ -1,4 +1,4 @@
 """Client for interacting with the LangSmith API.

 Use the client to customize API keys / workspace connections, SSL certs,
 etc. for tracing.
@@ -82,6 +82,7 @@
     _SIZE_LIMIT_BYTES,
 )
 from langsmith._internal._multipart import (
+    MultipartPart,
     MultipartPartsAndContext,
     join_multipart_parts_and_context,
 )
@@ -3369,6 +3370,101 @@
             created_at=created_at,
         )
 
+    def upsert_examples_multipart(
+        self,
+        *,
+        upserts: Optional[List[ls_schemas.ExampleCreateWithAttachments]] = None,
+    ) -> None:
+        """Upsert examples, with their attachments, in one multipart request.
+
+        Every example is encoded as several form parts named after its ID:
+        ``{id}`` (``dataset_id``, ``created_at`` and, when set, ``metadata``
+        and ``split``), ``{id}.inputs``, ``{id}.outputs`` and one
+        ``{id}.attachment.{name}`` part per attachment.
+
+        Args:
+            upserts: Examples to upsert. An example without an ``id`` is sent
+                under a freshly generated UUID4. ``None`` or an empty list
+                is a no-op and sends no request.
+
+        Raises:
+            TypeError: If ``upserts`` is not a list, or if any item is not an
+                ``ExampleCreateWithAttachments``.
+        """
+        if upserts is None:
+            return
+        if not isinstance(upserts, list):
+            raise TypeError(f"upserts must be a list, got {type(upserts)}")
+        for item in upserts:
+            if not isinstance(item, ls_schemas.ExampleCreateWithAttachments):
+                raise TypeError(
+                    "Each item must be ExampleCreateWithAttachments, "
+                    f"got {type(item)}"
+                )
+        if not upserts:
+            return
+
+        def json_part(name: str, payload: object) -> MultipartPart:
+            """Build one ``application/json`` form part from ``payload``."""
+            body = _dumps_json(payload)
+            return (
+                name,
+                (None, body, "application/json", {"Content-Length": str(len(body))}),
+            )
+
+        parts: List[MultipartPart] = []
+        for example in upserts:
+            # Every part name is prefixed with the example ID, so generate one
+            # here when the caller did not provide it.
+            example_id = str(example.id if example.id is not None else uuid.uuid4())
+
+            fields = {
+                "dataset_id": example.dataset_id,
+                "created_at": example.created_at,
+            }
+            if example.metadata is not None:
+                fields["metadata"] = example.metadata
+            if example.split is not None:
+                fields["split"] = example.split
+
+            parts.append(json_part(example_id, fields))
+            parts.append(json_part(f"{example_id}.inputs", example.inputs))
+            parts.append(json_part(f"{example_id}.outputs", example.outputs))
+
+            for name, attachment in (example.attachments or {}).items():
+                # Attachments are either ``(mime_type, data)`` tuples or objects
+                # exposing ``mime_type`` / ``data`` attributes.
+                if isinstance(attachment, tuple):
+                    mime_type, content = attachment
+                else:
+                    mime_type, content = attachment.mime_type, attachment.data
+                parts.append(
+                    (
+                        f"{example_id}.attachment.{name}",
+                        # The payload size travels in the content type itself.
+                        (None, content, f"{mime_type}; length={len(content)}", {}),
+                    )
+                )
+
+        encoder = rqtb_multipart.MultipartEncoder(parts, boundary=BOUNDARY)
+        # Bodies up to ~20 MB are rendered to bytes up front; anything larger
+        # is passed on as the encoder object instead of being materialized.
+        max_in_memory_bytes = 20_000_000
+        data = encoder.to_string() if encoder.len <= max_in_memory_bytes else encoder
+
+        response = self.request_with_retries(
+            "POST",
+            "/v1/platform/examples/multipart",
+            request_kwargs={
+                "data": data,
+                "headers": {
+                    **self._headers,
+                    "Content-Type": encoder.content_type,
+                },
+            },
+        )
+        ls_utils.raise_for_status_with_text(response)
+
     def create_examples(
         self,
         *,

diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
@@ -89,6 +89,12 @@ class ExampleCreate(ExampleBase):
     split: Optional[Union[str, List[str]]] = None
 
 
+class ExampleCreateWithAttachments(ExampleCreate):
+    """Example creation payload that can additionally carry attachments."""
+
+    attachments: Optional[Attachments] = None  # None means no attachments
+
+
 class Example(ExampleBase):
     """Example model."""
 

diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
@@ -20,8 +20,9 @@
 from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
 
 from langsmith.client import ID_TYPE, Client
-from langsmith.schemas import DataType
+from langsmith.schemas import DataType, ExampleCreateWithAttachments
 from langsmith.utils import (
+    LangSmithNotFoundError,
     LangSmithConnectionError,
     LangSmithError,
     get_env_var,
@@ -368,6 +369,83 @@ def test_error_surfaced_invalid_uri(uri: str) -> None:
     with pytest.raises(LangSmithConnectionError):
         client.create_run("My Run", inputs={"text": "hello world"}, run_type="llm")
 
+
+# TODO: this test targets the dev deployment until the backend changes reach
+# prod. Once they do, drop the parametrized URI and the placeholder API key and
+# use the shared ``langchain_client`` fixture like the other tests here.
+@pytest.mark.parametrize("uri", ["https://dev.api.smith.langchain.com"])
+def test_upsert_examples_multipart(uri: str) -> None:
+    """Test upserting examples with attachments via multipart endpoint."""
+    dataset_name = "__test_upsert_examples_multipart" + uuid4().hex[:4]
+    langchain_client = Client(api_url=uri, api_key="NEED TO HARDCODE FOR TESTING")
+    if langchain_client.has_dataset(dataset_name=dataset_name):
+        langchain_client.delete_dataset(dataset_name=dataset_name)
+
+    dataset = langchain_client.create_dataset(
+        dataset_name,
+        description="Test dataset for multipart example upload",
+        data_type=DataType.kv,
+    )
+
+    try:
+        # Example with an explicit ID.
+        example_id = uuid4()
+        example_1 = ExampleCreateWithAttachments(
+            id=example_id,
+            dataset_id=dataset.id,
+            inputs={"text": "hello world"},
+            outputs={"response": "greeting"},
+            attachments={
+                "test_file": ("text/plain", b"test content"),
+            },
+        )
+        # Example without an ID.
+        example_2 = ExampleCreateWithAttachments(
+            dataset_id=dataset.id,
+            inputs={"text": "foo bar"},
+            outputs={"response": "baz"},
+            attachments={
+                "my_file": ("text/plain", b"more test content"),
+            },
+        )
+
+        langchain_client.upsert_examples_multipart(upserts=[example_1, example_2])
+
+        created_example = langchain_client.read_example(example_id)
+        assert created_example.inputs["text"] == "hello world"
+        assert created_example.outputs["response"] == "greeting"
+
+        all_examples_in_dataset = list(
+            langchain_client.list_examples(dataset_id=dataset.id)
+        )
+        assert len(all_examples_in_dataset) == 2
+
+        # An example that points at a nonexistent dataset must be rejected ...
+        example_3 = ExampleCreateWithAttachments(
+            dataset_id=uuid4(),  # not a real dataset
+            inputs={"text": "foo bar"},
+            outputs={"response": "baz"},
+            attachments={
+                "my_file": ("text/plain", b"more test content"),
+            },
+        )
+
+        with pytest.raises(LangSmithNotFoundError):
+            langchain_client.upsert_examples_multipart(upserts=[example_3])
+
+        # ... and must leave the existing dataset untouched.
+        all_examples_in_dataset = list(
+            langchain_client.list_examples(dataset_id=dataset.id)
+        )
+        assert len(all_examples_in_dataset) == 2
+
+        # Anything other than ExampleCreateWithAttachments raises TypeError.
+        with pytest.raises(TypeError):
+            langchain_client.upsert_examples_multipart(upserts=[{"foo": "bar"}])
+    finally:
+        # Always remove the dataset, even when an assertion above fails.
+        langchain_client.delete_dataset(dataset_name=dataset_name)
+
 
 def test_create_dataset(langchain_client: Client) -> None:
     dataset_name = "__test_create_dataset" + uuid4().hex[:4]

diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py
@@ -416,6 +416,93 @@ def test_create_run_mutate(
         assert outputs == {"messages": ["hi", "there"]}
 
 
+@mock.patch("langsmith.client.requests.Session")
+def test_upsert_examples_multipart(mock_session_cls: mock.Mock) -> None:
+    """Test that upsert_examples_multipart sends correct multipart data."""
+    mock_session = MagicMock()
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_session.request.return_value = mock_response
+    mock_session_cls.return_value = mock_session
+
+    client = Client(api_url="http://localhost:1984", api_key="123")
+
+    # Create test data
+    example_id = uuid.uuid4()
+    dataset_id = uuid.uuid4()
+    created_at = datetime(2015, 1, 1, 0, 0, 0)
+
+    example = ls_schemas.ExampleCreateWithAttachments(
+        id=example_id,
+        dataset_id=dataset_id,
+        created_at=created_at,
+        inputs={"input": "test input"},
+        outputs={"output": "test output"},
+        metadata={"meta": "data"},
+        split="train",
+        attachments={
+            "file1": ("text/plain", b"test data"),
+            "file2": ls_schemas.Attachment(
+                mime_type="application/json", data=b'{"key": "value"}'
+            ),
+        },
+    )
+    client.upsert_examples_multipart(upserts=[example])
+
+    # Verify the request
+    assert mock_session.request.call_count == 2  # we always make a call to /info
+    call_args = mock_session.request.call_args
+
+    assert call_args[0][0] == "POST"
+    assert call_args[0][1].endswith("/v1/platform/examples/multipart")
+
+    # Parse the multipart data
+    request_data = call_args[1]["data"]
+    content_type = call_args[1]["headers"]["Content-Type"]
+    boundary = parse_options_header(content_type)[1]["boundary"]
+
+    parser = MultipartParser(
+        io.BytesIO(
+            request_data
+            if isinstance(request_data, bytes)
+            else request_data.to_string()
+        ),
+        boundary,
+    )
+    parts = list(parser.parts())
+
+    # Expected part bodies, keyed by multipart field name
+    expected_parts = {
+        str(example_id): {
+            "dataset_id": str(dataset_id),
+            "created_at": created_at.isoformat(),
+            "metadata": {"meta": "data"},
+            "split": "train",
+        },
+        f"{example_id}.inputs": {"input": "test input"},
+        f"{example_id}.outputs": {"output": "test output"},
+        f"{example_id}.attachment.file1": "test data",
+        f"{example_id}.attachment.file2": '{"key": "value"}',
+    }
+
+    assert len(parts) == len(expected_parts)
+    # Attachment content types carry the payload size as "; length=<bytes>".
+    for part in parts:
+        name = part.name
+        assert name in expected_parts, f"Unexpected part: {name}"
+
+        if name.endswith(".attachment.file1"):
+            assert part.value == expected_parts[name]
+            assert part.headers["Content-Type"] == "text/plain; length=9"
+        elif name.endswith(".attachment.file2"):
+            assert part.value == expected_parts[name]
+            assert part.headers["Content-Type"] == "application/json; length=16"
+        else:
+            value = json.loads(part.value)
+            assert value == expected_parts[name]
+            assert part.headers["Content-Type"] == "application/json"
+
+
 class CallTracker:
     def __init__(self) -> None:
         self.counter = 0