track evaluator errors from sdk #1079

Merged: 4 commits, Nov 4, 2024
Changes from 1 commit
5 changes: 5 additions & 0 deletions python/langsmith/client.py
@@ -1,4 +1,4 @@
"""Client for interacting with the LangSmith API.

Use the client to customize API keys / workspace connections, SSL certs,
etc. for tracing.
@@ -4245,6 +4245,7 @@
),
feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
project_id=project_id,
extra=res.extra,
)
return results

@@ -4315,6 +4316,7 @@
project_id: Optional[ID_TYPE] = None,
comparative_experiment_id: Optional[ID_TYPE] = None,
feedback_group_id: Optional[ID_TYPE] = None,
extra: Optional[Dict] = None,
**kwargs: Any,
) -> ls_schemas.Feedback:
"""Create a feedback in the LangSmith API.
@@ -4360,6 +4362,8 @@
feedback_group_id : str or UUID
When logging preferences, ranking runs, or other comparative feedback,
this is used to group feedback together.
extra : dict
Metadata for the feedback.
"""
if run_id is None and project_id is None:
raise ValueError("One of run_id and project_id must be provided")
@@ -4419,6 +4423,7 @@
comparative_experiment_id, accept_null=True
),
feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True),
extra=extra,
)
feedback_block = _dumps_json(feedback.dict(exclude_none=True))
self.request_with_retries(
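For reference, a minimal sketch of how the new extra parameter might be used when logging feedback directly through the client. The run ID and metadata values are placeholders, and a configured LangSmith API key is assumed.

import uuid

from langsmith import Client

client = Client()  # assumes LANGSMITH_API_KEY is set in the environment

run_id = uuid.uuid4()  # placeholder; in practice, the ID of a traced run

client.create_feedback(
    run_id=run_id,
    key="correctness",
    score=0.0,
    comment="ValueError('missing reference output')",
    extra={"error": True},  # new: arbitrary metadata stored with the feedback
)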
32 changes: 32 additions & 0 deletions python/langsmith/evaluation/_runner.py
@@ -13,6 +13,8 @@
import random
import threading
import uuid
import inspect
import re
from contextvars import copy_context
from typing import (
Awaitable,
@@ -82,6 +84,27 @@
],
]

def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]:
Collaborator:

If we commit to doing this, we'll need to handle the case where you return an EvaluationResult as well.

Contributor (author):

Thanks for calling this out. I tried to handle all the cases I could think of:

  • return {"key":"foo","score":1} -> should extract ["foo"]
  • return {"score":1} -> should extract the name of the function
  • return EvaluationResult(...) -> should extract the key from the object
  • return EvaluationResults(results=...) -> should extract the keys from all the EvaluationResult objects passed to 'results'. This case is harder, but it works for simple invocations.

    # Find the return statement
    return_match = re.search(r'return\s*({[^}]+})', python_code)
    if not return_match:
        return []

    # Extract the dictionary from the return statement
    dict_str = return_match.group(1)

    # Find all keys in the dictionary
    key_matches = re.findall(r'"([^"]+)":', dict_str)

    # Filter out 'key' and 'score'
    feedback_keys = [key for key in key_matches if key not in ['key', 'score']]

    # If 'key' is present in the dictionary, add its value to the feedback_keys
    key_value_match = re.search(r'"key"\s*:\s*"([^"]+)"', dict_str)
    if key_value_match:
        feedback_keys.append(key_value_match.group(1))

    return feedback_keys
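As a rough illustration of what this commit's version of the helper extracts (the EvaluationResult and function-name cases discussed above are not handled by the version shown here), a sketch with a made-up evaluator source string:

example_src = '''
def correctness_evaluator(run, example):
    prediction = run.outputs["output"]
    return {"key": "correctness", "score": 1}
'''

# The regex locates the returned dict literal, pulls out the value of "key",
# and drops the reserved "key"/"score" names themselves.
print(extract_code_evaluator_feedback_keys(example_src))  # ['correctness']

# With no explicit "key" (e.g. return {"score": 1}), this version returns [].
print(extract_code_evaluator_feedback_keys('return {"score": 1}'))  # []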

def evaluate(
target: TARGET_T,
@@ -1353,6 +1376,15 @@ def _run_evaluators(
)
)
except Exception as e:
    feedback_keys = extract_code_evaluator_feedback_keys(
        inspect.getsource(evaluator.func)
    )
    error_response = EvaluationResults(
        results=[
            EvaluationResult(
                key=key,
                source_run_id=run.id,
                comment=repr(e),
                extra={"error": True},
            )
            for key in feedback_keys
        ]
    )
    eval_results["results"].extend(
        # TODO: This is a hack
        self.client._log_evaluation_feedback(
            error_response, run=run, _executor=executor
        )
    )
    logger.error(
        f"Error running evaluator {repr(evaluator)} on"
        f" run {run.id}: {repr(e)}",
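Taken together, a sketch of the behavior this change aims for: when an evaluator raises, the error is surfaced as feedback on the extracted key instead of being dropped silently. The dataset name and evaluator below are hypothetical, and running this requires a LangSmith API key and an existing dataset.

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

def correctness(run, example):
    # Raises KeyError if the traced run produced no "output" field.
    prediction = run.outputs["output"]
    return {"key": "correctness", "score": int(prediction == example.outputs["answer"])}

results = evaluate(
    lambda inputs: {},    # target returns no "output", so the evaluator raises
    data="my-dataset",    # hypothetical dataset name
    evaluators=[correctness],
    client=client,
)
# With this change, each example should still get a "correctness" feedback row,
# with comment=repr(KeyError('output')) and extra={"error": True}.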
2 changes: 2 additions & 0 deletions python/langsmith/evaluation/evaluator.py
@@ -90,6 +90,8 @@ class EvaluationResult(BaseModel):

If none provided, the evaluation feedback is applied to the
root trace being evaluated."""
extra: Optional[Dict] = None
"""Metadata for the evaluator run."""

class Config:
"""Pydantic model configuration."""
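A short sketch of the non-error use of the new field: a custom evaluator can attach arbitrary metadata to its result (the metadata contents here are made up).

from langsmith.evaluation import EvaluationResult

def exactness(run, example):
    match = run.outputs.get("output") == example.outputs.get("answer")
    return EvaluationResult(
        key="exactness",
        score=int(match),
        extra={"method": "string-compare", "version": 1},  # hypothetical metadata
    )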
2 changes: 2 additions & 0 deletions python/langsmith/schemas.py
@@ -460,6 +460,8 @@ class FeedbackBase(BaseModel):
"""For preference scoring, this group ID is shared across feedbacks for each
run in the group that was being compared."""
extra: Optional[Dict] = None
"""The metadata of the feedback."""

class Config:
"""Configuration class for the schema."""
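And the read path, sketched: since Feedback inherits from FeedbackBase, the stored metadata should come back when listing feedback (the run ID below is a placeholder).

import uuid

from langsmith import Client

client = Client()
run_id = uuid.uuid4()  # placeholder; in practice, the ID of a run with feedback

for fb in client.list_feedback(run_ids=[run_id]):
    print(fb.key, fb.comment, fb.extra)  # e.g. correctness KeyError('output') {'error': True}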