Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle utf-16 surrogates #613

Merged
merged 3 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 42 additions & 8 deletions python/langsmith/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import logging
import os
import random
import re
import socket
import sys
import threading
Expand Down Expand Up @@ -165,6 +166,19 @@ def _default_retry_config() -> Retry:
_MAX_DEPTH = 2


def _simple_default(obj: Any) -> Any:
    """Convert ``obj`` to a JSON-friendly value without walking nested objects.

    Used as the ``default`` hook for the plain ``json.dumps`` fallback.

    Args:
        obj: The value that could not be serialized directly.

    Returns:
        The ISO-8601 string for a ``datetime.datetime``, the string form of a
        ``uuid.UUID``, the JSON round-tripped value for anything ``json``
        accepts natively, and ``repr(obj)`` when any of those steps fails.
    """
    # Don't traverse into nested objects
    try:
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        if isinstance(obj, uuid.UUID):
            return str(obj)
        # No ``default`` hook is passed here, so unsupported types raise and
        # are handled by the ``repr`` fallback below.
        return json.loads(json.dumps(obj))
    except Exception as e:
        # Catch ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are never swallowed and silently
        # replaced by a ``repr`` string.
        logger.debug("Failed to serialize %s to JSON: %s", type(obj), e)
        return repr(obj)


def _serialize_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> Any:
try:
if depth >= _MAX_DEPTH:
Expand Down Expand Up @@ -222,17 +236,37 @@ def _serialize_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> Any:
return repr(obj)


# Matches one complete JSON backslash escape at a time. Consuming whole escapes
# keeps the scan aligned, so the second backslash of an escaped backslash
# (``\\``) is never mistaken for the start of a ``\uXXXX`` escape.
_JSON_ESCAPE_PATTERN = re.compile(
    rb"\\(?:(?P<surrogate>ud[89a-f][0-9a-f]{2})|.)",
    re.IGNORECASE | re.DOTALL,
)


def _elide_surrogates(s: bytes) -> bytes:
    """Remove ``\\uD800``-``\\uDFFF`` escape sequences from JSON-encoded bytes.

    Every surrogate escape is dropped, whether or not it is part of a pair.
    All other escapes, including an escaped backslash followed by the literal
    text ``ud800``, are left untouched so the result stays well-formed.

    Args:
        s: JSON text as bytes, with non-ASCII characters written as escapes.

    Returns:
        The same bytes with every surrogate escape removed.
    """
    return _JSON_ESCAPE_PATTERN.sub(
        lambda m: b"" if m.group("surrogate") is not None else m.group(0),
        s,
    )


def _dumps_json_single(
    obj: Any, default: Optional[Callable[[Any], Any]] = None
) -> bytes:
    """Serialize ``obj`` to JSON bytes, falling back to ``json`` on failure.

    ``orjson`` is tried first. If it raises ``TypeError``, the object is
    re-encoded with the standard library using ASCII-only output. That output
    is then normalized through ``orjson`` when it parses, or stripped of
    surrogate escapes when it does not.

    Args:
        obj: The value to serialize.
        default: Optional hook passed to ``orjson.dumps`` for unsupported
            types. The fallback path uses ``_simple_default`` instead.

    Returns:
        The JSON document as bytes.
    """
    try:
        return orjson.dumps(
            obj,
            default=default,
            option=orjson.OPT_SERIALIZE_NUMPY
            | orjson.OPT_SERIALIZE_DATACLASS
            | orjson.OPT_SERIALIZE_UUID
            | orjson.OPT_NON_STR_KEYS,
        )
    except TypeError as e:
        # Usually caused by UTF surrogate characters
        logger.debug(f"Orjson serialization failed: {repr(e)}. Falling back to json.")
        result = json.dumps(
            obj,
            default=_simple_default,
            ensure_ascii=True,
        ).encode("utf-8")
        try:
            # ``ensure_ascii=True`` makes ``result`` pure ASCII, so it is
            # handed to orjson as bytes; no decode step (and no codec error
            # handler) is needed.
            result = orjson.dumps(orjson.loads(result))
        except orjson.JSONDecodeError:
            # The escapes could not be decoded; drop the surrogate escapes
            # from the ASCII JSON text instead.
            result = _elide_surrogates(result)
        return result


def _dumps_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> bytes:
Expand Down
40 changes: 39 additions & 1 deletion python/tests/integration_tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import os
import random
import string
import sys
import time
from datetime import timedelta
from typing import Any, Callable, Dict, cast
from uuid import uuid4

import pytest
from freezegun import freeze_time
from langchain.schema import FunctionMessage, HumanMessage

from langsmith.client import ID_TYPE, Client
from langsmith.schemas import DataType
Expand Down Expand Up @@ -299,6 +299,8 @@ def test_create_run_with_masked_inputs_outputs(
def test_create_chat_example(
monkeypatch: pytest.MonkeyPatch, langchain_client: Client
) -> None:
from langchain.schema import FunctionMessage, HumanMessage

dataset_name = "__createChatExample-test-dataset"
try:
existing_dataset = langchain_client.read_dataset(dataset_name=dataset_name)
Expand Down Expand Up @@ -489,3 +491,39 @@ def _get_run(run_id: ID_TYPE, has_end: bool = False) -> bool:
else:
assert updated_run.tags == ["tag1", "tag2"]
assert updated_run.extra["runtime"] == created_run.extra["runtime"] # type: ignore


def test_surrogates():
    """Check that runs whose inputs contain surrogate code points can be created."""
    # Every code point from U+0000 to sys.maxunicode, surrogates included.
    every_code_point = "".join(map(chr, range(sys.maxunicode + 1)))
    surrogate_samples = [
        "Hello\ud83d\ude00",
        "Python\ud83d\udc0d",
        "Surrogate\ud834\udd1e",
        "Example\ud83c\udf89",
        "String\ud83c\udfa7",
        "With\ud83c\udf08",
        "Surrogates\ud83d\ude0e",
        "Embedded\ud83d\udcbb",
        "In\ud83c\udf0e",
        "The\ud83d\udcd6",
        "Text\ud83d\udcac",
        "收花🙄·到",
    ]
    langchain_client = Client()
    # First the hand-picked samples, then the exhaustive string.
    for text_payload in (surrogate_samples, every_code_point):
        langchain_client.create_run(
            name="test_run",
            inputs={"text": text_payload},
            run_type="llm",
            end_time=datetime.datetime.now(datetime.timezone.utc),
        )
13 changes: 13 additions & 0 deletions python/tests/unit_tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import json
import math
import os
import sys
import threading
import time
import uuid
Expand Down Expand Up @@ -766,6 +767,18 @@ class MyNamedTuple(NamedTuple):
raise


def test__dumps_json():
    """Serialize a string holding every code point and inspect the output."""
    # Every code point from U+0000 to sys.maxunicode, surrogates included.
    every_code_point = "".join(map(chr, range(sys.maxunicode + 1)))
    payload = _dumps_json({"chars": every_code_point})
    assert isinstance(payload, bytes)
    # The payload must decode as strict UTF-8.
    decoded = payload.decode("utf-8")
    assert '"chars"' in decoded
    for surrogate_escape in ("\\uD800", "\\uDC00"):
        assert surrogate_escape not in decoded


@patch("langsmith.client.requests.Session", autospec=True)
def test_host_url(_: MagicMock) -> None:
client = Client(api_url="https://api.foobar.com/api", api_key="API_KEY")
Expand Down
Loading