Python feat: track evaluator errors from sdk (#1079)
Co-authored-by: William Fu-Hinthorn <[email protected]>
isahers1 and hinthornw authored Nov 4, 2024
1 parent a1a8ac3 commit 13a5f70
Showing 6 changed files with 390 additions and 6 deletions.
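
For orientation before the per-file diffs: a hedged sketch of what this commit changes from the caller's point of view. It assumes LANGSMITH_API_KEY is set and that a dataset named "my-dataset" exists; the target function, dataset name, and evaluator below are placeholders rather than code from the diff.

from langsmith.evaluation import evaluate

def always_fails(run, example):
    # An evaluator that raises instead of returning a result.
    raise ValueError("missing output field")

results = evaluate(
    lambda inputs: {"answer": "42"},  # placeholder target function
    data="my-dataset",                # assumes this dataset exists
    evaluators=[always_fails],
)
# With this commit, the failure is no longer only a local log line: it is also
# recorded as feedback on the run, with comment=repr(ValueError(...)) and
# extra={"error": True}, under whatever feedback key(s) the SDK can extract
# from the evaluator.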
6 changes: 6 additions & 0 deletions python/langsmith/client.py
@@ -4120,6 +4120,7 @@ def _submit_feedback(**kwargs):
),
feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
project_id=project_id,
extra=res.extra,
trace_id=run.trace_id if run else None,
)
return results
@@ -4191,6 +4192,7 @@ def create_feedback(
project_id: Optional[ID_TYPE] = None,
comparative_experiment_id: Optional[ID_TYPE] = None,
feedback_group_id: Optional[ID_TYPE] = None,
extra: Optional[Dict] = None,
trace_id: Optional[ID_TYPE] = None,
**kwargs: Any,
) -> ls_schemas.Feedback:
@@ -4239,6 +4241,9 @@ def create_feedback(
feedback_group_id : str or UUID
When logging preferences, ranking runs, or other comparative feedback,
this is used to group feedback together.
extra : dict
Metadata for the feedback.
trace_id : str or UUID
The trace ID of the run to provide feedback for. Enables batch ingestion.
"""
if run_id is None and project_id is None:
raise ValueError("One of run_id and project_id must be provided")
@@ -4302,6 +4307,7 @@ def create_feedback(
comparative_experiment_id, accept_null=True
),
feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True),
extra=extra,
)

use_multipart = (self.info.batch_ingest_config or {}).get(
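
For illustration (not part of the diff): a hedged sketch of passing the new extra parameter through Client.create_feedback. The run ID and feedback key are placeholders.

import uuid
from langsmith import Client

client = Client()  # assumes LANGSMITH_API_KEY is configured
client.create_feedback(
    run_id=uuid.uuid4(),    # placeholder; use the run you are scoring
    key="correctness",      # placeholder feedback key
    comment=repr(ValueError("evaluator failed")),
    extra={"error": True},  # new: arbitrary feedback metadata, here flagging an error
)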
42 changes: 39 additions & 3 deletions python/langsmith/evaluation/_arunner.py
@@ -36,6 +36,7 @@
SUMMARY_EVALUATOR_T,
ExperimentResultRow,
_ExperimentManagerMixin,
_extract_feedback_keys,
_ForwardResults,
_load_examples_map,
_load_experiment,
@@ -46,7 +47,11 @@
_resolve_experiment,
_wrap_summary_evaluators,
)
from langsmith.evaluation.evaluator import EvaluationResults, RunEvaluator
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
RunEvaluator,
)

logger = logging.getLogger(__name__)

@@ -667,6 +672,34 @@ async def _arun_evaluators(
)
)
except Exception as e:
try:
feedback_keys = _extract_feedback_keys(evaluator)

error_response = EvaluationResults(
results=[
EvaluationResult(
key=key,
source_run_id=run.id,
comment=repr(e),
extra={"error": True},
)
for key in feedback_keys
]
)
eval_results["results"].extend(
# TODO: This is a hack
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
)
)
except Exception as e2:
logger.debug(f"Error parsing feedback keys: {e2}")
pass
logger.error(
f"Error running evaluator {repr(evaluator)} on"
f" run {run.id}: {repr(e)}",
exc_info=True,
)
logger.error(
f"Error running evaluator {repr(evaluator)} on"
f" run {run.id}: {repr(e)}",
@@ -727,7 +760,8 @@ async def _aapply_summary_evaluators(
)
except Exception as e:
logger.error(
f"Error running summary evaluator {repr(evaluator)}: {e}"
f"Error running summary evaluator {repr(evaluator)}: {e}",
exc_info=True,
)
yield {"results": aggregate_feedback}

@@ -861,7 +895,9 @@ def _get_run(r: run_trees.RunTree) -> None:
),
)
except Exception as e:
logger.error(f"Error running target function: {e}")
logger.error(
f"Error running target function: {e}", exc_info=True, stacklevel=1
)
return _ForwardResults(
run=cast(schemas.Run, run),
example=example,
186 changes: 184 additions & 2 deletions python/langsmith/evaluation/_runner.py
@@ -2,15 +2,18 @@

from __future__ import annotations

import ast
import collections
import concurrent.futures as cf
import datetime
import functools
import inspect
import itertools
import logging
import pathlib
import queue
import random
import textwrap
import threading
import uuid
from contextvars import copy_context
@@ -42,6 +45,7 @@
from langsmith.evaluation.evaluator import (
ComparisonEvaluationResult,
DynamicComparisonRunEvaluator,
DynamicRunEvaluator,
EvaluationResult,
EvaluationResults,
RunEvaluator,
@@ -1365,6 +1369,29 @@ def _run_evaluators(
)
)
except Exception as e:
try:
feedback_keys = _extract_feedback_keys(evaluator)

error_response = EvaluationResults(
results=[
EvaluationResult(
key=key,
source_run_id=run.id,
comment=repr(e),
extra={"error": True},
)
for key in feedback_keys
]
)
eval_results["results"].extend(
# TODO: This is a hack
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
)
)
except Exception as e2:
logger.debug(f"Error parsing feedback keys: {e2}")
pass
logger.error(
f"Error running evaluator {repr(evaluator)} on"
f" run {run.id}: {repr(e)}",
@@ -1469,7 +1496,8 @@ def _apply_summary_evaluators(
)
except Exception as e:
logger.error(
f"Error running summary evaluator {repr(evaluator)}: {e}"
f"Error running summary evaluator {repr(evaluator)}: {e}",
exc_info=True,
)
yield {"results": aggregate_feedback}

@@ -1593,7 +1621,9 @@ def _get_run(r: rt.RunTree) -> None:
),
)
except Exception as e:
logger.error(f"Error running target function: {e}")
logger.error(
f"Error running target function: {e}", exc_info=True, stacklevel=1
)
return _ForwardResults(
run=cast(schemas.Run, run),
example=example,
@@ -1662,3 +1692,155 @@ def _get_random_name() -> str:
from langsmith.evaluation._name_generation import random_name # noqa: F401

return random_name()


def _extract_feedback_keys(evaluator: RunEvaluator):
if isinstance(evaluator, DynamicRunEvaluator):
if getattr(evaluator, "func", None):
return _extract_code_evaluator_feedback_keys(evaluator.func)
elif getattr(evaluator, "afunc", None):
return _extract_code_evaluator_feedback_keys(evaluator.afunc)
# TODO: Support for DynamicComparisonRunEvaluator
if hasattr(evaluator, "evaluator"):
# LangChainStringEvaluator
if getattr(getattr(evaluator, "evaluator"), "evaluation_name", None):
return [evaluator.evaluator.evaluation_name]
return []


def _extract_code_evaluator_feedback_keys(func: Callable) -> list[str]:
python_code = inspect.getsource(func)

def extract_dict_keys(node):
if isinstance(node, ast.Dict):
keys = []
key_value = None
for key, value in zip(node.keys, node.values):
if isinstance(key, (ast.Str, ast.Constant)):
key_str = key.s if isinstance(key, ast.Str) else key.value
if key_str == "key" and isinstance(value, (ast.Str, ast.Constant)):
key_value = (
value.s if isinstance(value, ast.Str) else value.value
)
return [key_value] if key_value else keys
elif (
isinstance(node, ast.Call)
and isinstance(node.func, ast.Name)
and node.func.id == "dict"
):
for keyword in node.keywords:
if keyword.arg == "key" and isinstance(
keyword.value, (ast.Str, ast.Constant)
):
return [
(
keyword.value.s
if isinstance(keyword.value, ast.Str)
else keyword.value.value
)
]
return []

def extract_evaluation_result_key(node):
if (
isinstance(node, ast.Call)
and isinstance(node.func, ast.Name)
and node.func.id == "EvaluationResult"
):
for keyword in node.keywords:
if keyword.arg == "key" and isinstance(
keyword.value, (ast.Str, ast.Constant)
):
return [
(
keyword.value.s
if isinstance(keyword.value, ast.Str)
else keyword.value.value
)
]
return []

def extract_evaluation_results_keys(node, variables):
if (
isinstance(node, ast.Call)
and isinstance(node.func, ast.Name)
and node.func.id == "EvaluationResults"
):
for keyword in node.keywords:
if keyword.arg == "results":
if isinstance(keyword.value, ast.Name):
return variables.get(keyword.value.id, [])
elif isinstance(keyword.value, ast.List):
keys = []
for elt in keyword.value.elts:
keys.extend(extract_evaluation_result_key(elt))
return keys
elif isinstance(node, ast.Dict):
for key, value in zip(node.keys, node.values):
if isinstance(key, (ast.Str, ast.Constant)) and key.s == "results":
if isinstance(value, ast.List):
keys = []
for elt in value.elts:
if isinstance(elt, ast.Dict):
for elt_key, elt_value in zip(elt.keys, elt.values):
if (
isinstance(elt_key, (ast.Str, ast.Constant))
and elt_key.s == "key"
):
if isinstance(
elt_value, (ast.Str, ast.Constant)
):
keys.append(elt_value.s)
elif (
isinstance(elt, ast.Call)
and isinstance(elt.func, ast.Name)
and elt.func.id in ("EvaluationResult", "dict")
):
for keyword in elt.keywords:
if keyword.arg == "key" and isinstance(
keyword.value, (ast.Str, ast.Constant)
):
keys.append(
keyword.value.s
if isinstance(keyword.value, ast.Str)
else keyword.value.value
)

return keys
return []

python_code = textwrap.dedent(python_code)

try:
tree = ast.parse(python_code)
function_def = tree.body[0]
if not isinstance(function_def, ast.FunctionDef):
return []

variables = {}
keys = []

for node in ast.walk(function_def):
if isinstance(node, ast.Assign):
if isinstance(node.value, ast.List):
list_keys = []
for elt in node.value.elts:
list_keys.extend(extract_evaluation_result_key(elt))
if isinstance(node.targets[0], ast.Name):
variables[node.targets[0].id] = list_keys
elif isinstance(node, ast.Return) and node.value is not None:
dict_keys = extract_dict_keys(node.value)
eval_result_key = extract_evaluation_result_key(node.value)
eval_results_keys = extract_evaluation_results_keys(
node.value, variables
)

keys.extend(dict_keys)
keys.extend(eval_result_key)
keys.extend(eval_results_keys)

# If no keys found, return the function name
return keys if keys else [function_def.name]

except SyntaxError:
return []
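
For illustration (not part of the diff): how the private AST helper above is expected to behave on two toy evaluators. The functions are placeholders, and the expected results follow from the extraction logic shown, not from a documented API.

def correctness(run, example):
    return {"key": "correctness", "score": 1}

def no_explicit_key(run, example):
    return {"score": 0}

# Expected, per the extraction logic above:
#   _extract_code_evaluator_feedback_keys(correctness)     -> ["correctness"]
#   _extract_code_evaluator_feedback_keys(no_explicit_key) -> ["no_explicit_key"]
# The second case falls back to the function name because no "key" entry is
# found in the returned dict.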
2 changes: 2 additions & 0 deletions python/langsmith/evaluation/evaluator.py
@@ -90,6 +90,8 @@ class EvaluationResult(BaseModel):
If none provided, the evaluation feedback is applied to the
root trace being evaluated."""
extra: Optional[Dict] = None
"""Metadata for the evaluator run."""

class Config:
"""Pydantic model configuration."""
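
For illustration (not part of the diff): constructing an EvaluationResult that carries the new extra metadata, mirroring how the runners flag evaluator errors. The key and run ID are placeholders.

import uuid
from langsmith.evaluation.evaluator import EvaluationResult

error_result = EvaluationResult(
    key="correctness",           # placeholder feedback key
    source_run_id=uuid.uuid4(),  # placeholder run ID
    comment=repr(ValueError("evaluator failed")),
    extra={"error": True},       # new field: metadata for the evaluator run
)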
2 changes: 2 additions & 0 deletions python/langsmith/schemas.py
@@ -486,6 +486,8 @@ class FeedbackBase(BaseModel):
"""For preference scoring, this group ID is shared across feedbacks for each
run in the group that was being compared."""
extra: Optional[Dict] = None
"""The metadata of the feedback."""

class Config:
"""Configuration class for the schema."""