Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: log wrap_openai runs with unified usage_metadata #1071

Merged
merged 11 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion js/src/tests/evaluate.int.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {

Check notice on line 1 in js/src/tests/evaluate.int.test.ts

View workflow job for this annotation

GitHub Actions / benchmark

Benchmark results

......................................... create_5_000_run_trees: Mean +- std dev: 560 ms +- 39 ms ......................................... create_10_000_run_trees: Mean +- std dev: 1.11 sec +- 0.05 sec ......................................... create_20_000_run_trees: Mean +- std dev: 1.11 sec +- 0.05 sec ......................................... dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 769 us +- 8 us ......................................... dumps_class_nested_py_leaf_50x100: Mean +- std dev: 27.0 ms +- 0.2 ms ......................................... dumps_class_nested_py_leaf_100x200: Mean +- std dev: 111 ms +- 2 ms ......................................... dumps_dataclass_nested_50x100: Mean +- std dev: 27.4 ms +- 0.3 ms ......................................... WARNING: the benchmark result may be unstable * the standard deviation (5.83 ms) is 10% of the mean (56.2 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. dumps_pydantic_nested_50x100: Mean +- std dev: 56.2 ms +- 5.8 ms ......................................... WARNING: the benchmark result may be unstable * the standard deviation (26.6 ms) is 13% of the mean (208 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. dumps_pydanticv1_nested_50x100: Mean +- std dev: 208 ms +- 27 ms

Check notice on line 1 in js/src/tests/evaluate.int.test.ts

View workflow job for this annotation

GitHub Actions / benchmark

Comparison against main

+------------------------------------+----------+------------------------+ | Benchmark | main | changes | +====================================+==========+========================+ | dumps_pydantic_nested_50x100 | 61.6 ms | 56.2 ms: 1.09x faster | +------------------------------------+----------+------------------------+ | dumps_pydanticv1_nested_50x100 | 220 ms | 208 ms: 1.06x faster | +------------------------------------+----------+------------------------+ | create_5_000_run_trees | 580 ms | 560 ms: 1.04x faster | +------------------------------------+----------+------------------------+ | create_10_000_run_trees | 1.14 sec | 1.11 sec: 1.03x faster | +------------------------------------+----------+------------------------+ | dumps_class_nested_py_leaf_50x100 | 27.4 ms | 27.0 ms: 1.02x faster | +------------------------------------+----------+------------------------+ | dumps_dataclass_nested_50x100 | 27.8 ms | 27.4 ms: 1.02x faster | +------------------------------------+----------+------------------------+ | dumps_class_nested_py_leaf_100x200 | 112 ms | 111 ms: 1.01x faster | +------------------------------------+----------+------------------------+ | Geometric mean | (ref) | 1.03x faster | +------------------------------------+----------+------------------------+ Benchmark hidden because not significant (2): create_20_000_run_trees, dumps_class_nested_py_branch_and_leaf_200x400
EvaluationResult,
EvaluationResults,
} from "../evaluation/evaluator.js";
Expand Down Expand Up @@ -625,7 +625,7 @@
expect(receivedCommentStrings).toEqual(expectedCommentString);
});

test("Target func can be a runnable", async () => {
test.skip("Target func can be a runnable", async () => {
const targetFunc = RunnableSequence.from([
RunnableLambda.from((input: Record<string, any>) => ({
foo: input.input + 1,
Expand Down
63 changes: 62 additions & 1 deletion python/langsmith/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
)
from uuid import UUID

from typing_extensions import TypedDict
from typing_extensions import NotRequired, TypedDict

try:
from pydantic.v1 import ( # type: ignore[import]
Expand Down Expand Up @@ -891,3 +891,64 @@ class PromptSortField(str, Enum):
"""Last updated time."""
num_likes = "num_likes"
"""Number of likes."""


class InputTokenDetails(TypedDict, total=False):
Copy link
Contributor

@baskaryan baskaryan Oct 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unrelated to this pr — wonder if langchain-core should import these from langsmith sdk now

"""Breakdown of input token counts.

Does *not* need to sum to full input token count. Does *not* need to have all keys.
"""

audio: int
"""Audio input tokens."""
cache_creation: int
"""Input tokens that were cached and there was a cache miss.

Since there was a cache miss, the cache was created from these tokens.
"""
cache_read: int
"""Input tokens that were cached and there was a cache hit.

Since there was a cache hit, the tokens were read from the cache. More precisely,
the model state given these tokens was read from the cache.
"""


class OutputTokenDetails(TypedDict, total=False):
"""Breakdown of output token counts.

Does *not* need to sum to full output token count. Does *not* need to have all keys.
"""

audio: int
"""Audio output tokens."""
reasoning: int
"""Reasoning output tokens.

Tokens generated by the model in a chain of thought process (i.e. by OpenAI's o1
models) that are not returned as part of model output.
"""


class UsageMetadata(TypedDict):
"""Usage metadata for a message, such as token counts.

This is a standard representation of token usage that is consistent across models.
"""

input_tokens: int
"""Count of input (or prompt) tokens. Sum of all input token types."""
output_tokens: int
"""Count of output (or completion) tokens. Sum of all output token types."""
total_tokens: int
"""Total token count. Sum of input_tokens + output_tokens."""
input_token_details: NotRequired[InputTokenDetails]
"""Breakdown of input token counts.

Does *not* need to sum to full input token count. Does *not* need to have all keys.
"""
output_token_details: NotRequired[OutputTokenDetails]
"""Breakdown of output token counts.

Does *not* need to sum to full output token count. Does *not* need to have all keys.
"""
57 changes: 57 additions & 0 deletions python/langsmith/wrappers/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from langsmith import client as ls_client
from langsmith import run_helpers
from langsmith.schemas import InputTokenDetails, OutputTokenDetails, UsageMetadata

if TYPE_CHECKING:
from openai import AsyncOpenAI, OpenAI
Expand Down Expand Up @@ -141,6 +142,12 @@ def _reduce_chat(all_chunks: List[ChatCompletionChunk]) -> dict:
]
else:
d = {"choices": [{"message": {"role": "assistant", "content": ""}}]}
# streamed outputs don't go through `process_outputs`
# so we need to flatten metadata here
oai_token_usage = d.pop("usage", None)
d["usage_metadata"] = (
_create_usage_metadata(oai_token_usage) if oai_token_usage else None
)
return d


Expand All @@ -160,12 +167,59 @@ def _reduce_completions(all_chunks: List[Completion]) -> dict:
return d


def _create_usage_metadata(oai_token_usage: dict) -> UsageMetadata:
input_tokens = oai_token_usage.get("prompt_tokens") or 0
output_tokens = oai_token_usage.get("completion_tokens") or 0
total_tokens = oai_token_usage.get("total_tokens") or input_tokens + output_tokens
input_token_details: dict = {
"audio": (oai_token_usage.get("prompt_tokens_details") or {}).get(
"audio_tokens"
),
"cache_read": (oai_token_usage.get("prompt_tokens_details") or {}).get(
"cached_tokens"
),
}
output_token_details: dict = {
"audio": (oai_token_usage.get("completion_tokens_details") or {}).get(
"audio_tokens"
),
"reasoning": (oai_token_usage.get("completion_tokens_details") or {}).get(
"reasoning_tokens"
),
}
return UsageMetadata(
input_tokens=input_tokens,
output_tokens=output_tokens,
total_tokens=total_tokens,
input_token_details=InputTokenDetails(
**{k: v for k, v in input_token_details.items() if v is not None}
),
output_token_details=OutputTokenDetails(
**{k: v for k, v in output_token_details.items() if v is not None}
),
)


def _process_chat_completion(outputs: Any):
try:
rdict = outputs.model_dump()
oai_token_usage = rdict.pop("usage", None)
rdict["usage_metadata"] = (
_create_usage_metadata(oai_token_usage) if oai_token_usage else None
)
return rdict
except BaseException as e:
logger.debug(f"Error processing chat completion: {e}")
return {"output": outputs}


def _get_wrapper(
original_create: Callable,
name: str,
reduce_fn: Callable,
tracing_extra: Optional[TracingExtra] = None,
invocation_params_fn: Optional[Callable] = None,
process_outputs: Optional[Callable] = None,
) -> Callable:
textra = tracing_extra or {}

Expand All @@ -177,6 +231,7 @@ def create(*args, stream: bool = False, **kwargs):
reduce_fn=reduce_fn if stream else None,
process_inputs=_strip_not_given,
_invocation_params_fn=invocation_params_fn,
process_outputs=process_outputs,
**textra,
)

Expand All @@ -191,6 +246,7 @@ async def acreate(*args, stream: bool = False, **kwargs):
reduce_fn=reduce_fn if stream else None,
process_inputs=_strip_not_given,
_invocation_params_fn=invocation_params_fn,
process_outputs=process_outputs,
**textra,
)
return await decorator(original_create)(*args, stream=stream, **kwargs)
Expand Down Expand Up @@ -232,6 +288,7 @@ def wrap_openai(
_reduce_chat,
tracing_extra=tracing_extra,
invocation_params_fn=functools.partial(_infer_invocation_params, "chat"),
process_outputs=_process_chat_completion,
)
client.completions.create = _get_wrapper( # type: ignore[method-assign]
client.completions.create,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"post": [
{
"id": "d0d84d31-923d-4cb5-94a8-40a0a0087578",
"start_time": "2024-10-11T20:58:23.298773+00:00",
"extra": {
"metadata": {
"ls_method": "traceable",
"ls_provider": "openai",
"ls_model_type": "chat",
"ls_model_name": "gpt-4o-mini",
"revision_id": "v0.1.82-381-g03d9e1a-dirty"
},
"runtime": {
"sdk": "langsmith-py",
"sdk_version": "0.1.131",
"library": "langsmith",
"platform": "macOS-13.2-arm64-arm-64bit",
"runtime": "python",
"py_implementation": "CPython",
"runtime_version": "3.11.7",
"langchain_version": "0.2.9",
"langchain_core_version": "0.2.21"
}
},
"serialized": {
"name": "ChatOpenAI",
"signature": "(*, messages: 'Iterable[ChatCompletionMessageParam]', model: 'Union[str, ChatModel]', frequency_penalty: 'Optional[float] | NotGiven' = NOT_GIVEN, function_call: 'completion_create_params.FunctionCall | NotGiven' = NOT_GIVEN, functions: 'Iterable[completion_create_params.Function] | NotGiven' = NOT_GIVEN, logit_bias: 'Optional[Dict[str, int]] | NotGiven' = NOT_GIVEN, logprobs: 'Optional[bool] | NotGiven' = NOT_GIVEN, max_completion_tokens: 'Optional[int] | NotGiven' = NOT_GIVEN, max_tokens: 'Optional[int] | NotGiven' = NOT_GIVEN, n: 'Optional[int] | NotGiven' = NOT_GIVEN, parallel_tool_calls: 'bool | NotGiven' = NOT_GIVEN, presence_penalty: 'Optional[float] | NotGiven' = NOT_GIVEN, response_format: 'completion_create_params.ResponseFormat | NotGiven' = NOT_GIVEN, seed: 'Optional[int] | NotGiven' = NOT_GIVEN, service_tier: \"Optional[Literal['auto', 'default']] | NotGiven\" = NOT_GIVEN, stop: 'Union[Optional[str], List[str]] | NotGiven' = NOT_GIVEN, stream: 'Optional[Literal[False]] | Literal[True] | NotGiven' = NOT_GIVEN, stream_options: 'Optional[ChatCompletionStreamOptionsParam] | NotGiven' = NOT_GIVEN, temperature: 'Optional[float] | NotGiven' = NOT_GIVEN, tool_choice: 'ChatCompletionToolChoiceOptionParam | NotGiven' = NOT_GIVEN, tools: 'Iterable[ChatCompletionToolParam] | NotGiven' = NOT_GIVEN, top_logprobs: 'Optional[int] | NotGiven' = NOT_GIVEN, top_p: 'Optional[float] | NotGiven' = NOT_GIVEN, user: 'str | NotGiven' = NOT_GIVEN, extra_headers: 'Headers | None' = None, extra_query: 'Query | None' = None, extra_body: 'Body | None' = None, timeout: 'float | httpx.Timeout | None | NotGiven' = NOT_GIVEN) -> 'ChatCompletion | AsyncStream[ChatCompletionChunk]'",
"doc": null
},
"events": [],
"tags": [],
"attachments": {},
"dotted_order": "20241011T205823298773Zd0d84d31-923d-4cb5-94a8-40a0a0087578",
"trace_id": "d0d84d31-923d-4cb5-94a8-40a0a0087578",
"outputs": {},
"session_name": "default",
"name": "ChatOpenAI",
"inputs": {
"messages": [
{
"role": "user",
"content": "howdy"
}
],
"model": "gpt-4o-mini",
"stream": false,
"extra_headers": null,
"extra_query": null,
"extra_body": null
},
"run_type": "llm"
}
],
"patch": [
{
"id": "d0d84d31-923d-4cb5-94a8-40a0a0087578",
"name": "ChatOpenAI",
"trace_id": "d0d84d31-923d-4cb5-94a8-40a0a0087578",
"parent_run_id": null,
"dotted_order": "20241011T205823298773Zd0d84d31-923d-4cb5-94a8-40a0a0087578",
"tags": [],
"extra": {
"metadata": {
"ls_method": "traceable",
"ls_provider": "openai",
"ls_model_type": "chat",
"ls_model_name": "gpt-4o-mini",
"revision_id": "v0.1.82-381-g03d9e1a-dirty"
},
"runtime": {
"sdk": "langsmith-py",
"sdk_version": "0.1.131",
"library": "langsmith",
"platform": "macOS-13.2-arm64-arm-64bit",
"runtime": "python",
"py_implementation": "CPython",
"runtime_version": "3.11.7",
"langchain_version": "0.2.9",
"langchain_core_version": "0.2.21"
}
},
"end_time": "2024-10-11T20:58:24.417106+00:00",
"outputs": {
"id": "chatcmpl-AHH0KBvLG7Wq3wfSEGQuxh0xE07Fl",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Howdy! How can I assist you today?",
"refusal": null,
"role": "assistant",
"function_call": null,
"tool_calls": null
}
}
],
"created": 1728680304,
"model": "gpt-4o-mini-2024-07-18",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_e2bde53e6e",
"usage_metadata": {
"input_tokens": 9,
"output_tokens": 9,
"total_tokens": 18,
"input_token_details": {
"cache_read": 0
},
"output_token_details": {
"reasoning": 0
}
}
},
"events": []
}
]
}
Loading
Loading