diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 5db801a64..eefdf936c 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -12,6 +12,7 @@ import logging import os import random +import re import socket import sys import threading @@ -165,6 +166,19 @@ def _default_retry_config() -> Retry: _MAX_DEPTH = 2 +def _simple_default(obj: Any) -> Any: + # Don't traverse into nested objects + try: + if isinstance(obj, datetime.datetime): + return obj.isoformat() + if isinstance(obj, uuid.UUID): + return str(obj) + return json.loads(json.dumps(obj)) + except BaseException as e: + logger.debug(f"Failed to serialize {type(obj)} to JSON: {e}") + return repr(obj) + + def _serialize_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> Any: try: if depth >= _MAX_DEPTH: @@ -222,17 +236,37 @@ def _serialize_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> Any: return repr(obj) +def _elide_surrogates(s: bytes) -> bytes: + pattern = re.compile(rb"\\ud[89a-f][0-9a-f]{2}", re.IGNORECASE) + result = pattern.sub(b"", s) + return result + + def _dumps_json_single( obj: Any, default: Optional[Callable[[Any], Any]] = None ) -> bytes: - return orjson.dumps( - obj, - default=default, - option=orjson.OPT_SERIALIZE_NUMPY - | orjson.OPT_SERIALIZE_DATACLASS - | orjson.OPT_SERIALIZE_UUID - | orjson.OPT_NON_STR_KEYS, - ) + try: + return orjson.dumps( + obj, + default=default, + option=orjson.OPT_SERIALIZE_NUMPY + | orjson.OPT_SERIALIZE_DATACLASS + | orjson.OPT_SERIALIZE_UUID + | orjson.OPT_NON_STR_KEYS, + ) + except TypeError as e: + # Usually caused by UTF surrogate characters + logger.debug(f"Orjson serialization failed: {repr(e)}. 
Falling back to json.") + result = json.dumps( + obj, + default=_simple_default, + ensure_ascii=True, + ).encode("utf-8") + try: + result = orjson.dumps(orjson.loads(result.decode("utf-8", errors="lossy"))) + except orjson.JSONDecodeError: + result = _elide_surrogates(result) + return result def _dumps_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> bytes: diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index 1bf53af33..4ff1a6afc 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -5,6 +5,7 @@ import os import random import string +import sys import time from datetime import timedelta from typing import Any, Callable, Dict, cast @@ -12,7 +13,6 @@ import pytest from freezegun import freeze_time -from langchain.schema import FunctionMessage, HumanMessage from langsmith.client import ID_TYPE, Client from langsmith.schemas import DataType @@ -299,6 +299,8 @@ def test_create_run_with_masked_inputs_outputs( def test_create_chat_example( monkeypatch: pytest.MonkeyPatch, langchain_client: Client ) -> None: + from langchain.schema import FunctionMessage, HumanMessage + dataset_name = "__createChatExample-test-dataset" try: existing_dataset = langchain_client.read_dataset(dataset_name=dataset_name) @@ -489,3 +491,39 @@ def _get_run(run_id: ID_TYPE, has_end: bool = False) -> bool: else: assert updated_run.tags == ["tag1", "tag2"] assert updated_run.extra["runtime"] == created_run.extra["runtime"] # type: ignore + + +def test_surrogates(): + chars = "".join(chr(cp) for cp in range(0, sys.maxunicode + 1)) + trans_table = str.maketrans("", "", "") + all_chars = chars.translate(trans_table) + langchain_client = Client() + langchain_client.create_run( + name="test_run", + inputs={ + "text": [ + "Hello\ud83d\ude00", + "Python\ud83d\udc0d", + "Surrogate\ud834\udd1e", + "Example\ud83c\udf89", + "String\ud83c\udfa7", + "With\ud83c\udf08", + 
def test_surrogates():
    """Post runs whose inputs contain surrogate code points.

    Integration test: it constructs a ``Client`` and calls ``create_run``
    twice, once with strings that embed surrogate escapes and once with a
    string holding every code point up to ``sys.maxunicode``.  It makes no
    assertions of its own; it passes when neither call raises.
    """
    all_chars = "".join(chr(cp) for cp in range(sys.maxunicode + 1))
    langchain_client = Client()
    langchain_client.create_run(
        name="test_run",
        inputs={
            "text": [
                "Hello\ud83d\ude00",
                "Python\ud83d\udc0d",
                "Surrogate\ud834\udd1e",
                "Example\ud83c\udf89",
                "String\ud83c\udfa7",
                "With\ud83c\udf08",
                "Surrogates\ud83d\ude0e",
                "Embedded\ud83d\udcbb",
                "In\ud83c\udf0e",
                "The\ud83d\udcd6",
                "Text\ud83d\udcac",
                "收花🙄·到",
            ]
        },
        run_type="llm",
        end_time=datetime.datetime.now(datetime.timezone.utc),
    )
    langchain_client.create_run(
        name="test_run",
        inputs={
            "text": all_chars,
        },
        run_type="llm",
        end_time=datetime.datetime.now(datetime.timezone.utc),
    )


def test__dumps_json():
    """Check ``_dumps_json`` on a string containing every Unicode code point.

    The input includes the whole surrogate range, so the result must be bytes
    that decode as UTF-8, parse as JSON, and contain no surrogate escapes.
    """
    all_chars = "".join(chr(cp) for cp in range(sys.maxunicode + 1))
    serialized_json = _dumps_json({"chars": all_chars})
    assert isinstance(serialized_json, bytes)
    serialized_str = serialized_json.decode("utf-8")
    assert '"chars"' in serialized_str
    # The payload must still be a well-formed JSON document.
    assert "chars" in json.loads(serialized_str)
    # The stdlib json module writes escapes with lowercase hex digits, so the
    # comparison has to be case-insensitive to be able to fail at all.
    lowered = serialized_str.lower()
    assert "\\ud800" not in lowered
    assert "\\udc00" not in lowered