diff --git a/python/langsmith/client.py b/python/langsmith/client.py
index 6fe4d33c0..ae55a7470 100644
--- a/python/langsmith/client.py
+++ b/python/langsmith/client.py
@@ -4245,6 +4245,7 @@ def _submit_feedback(**kwargs):
                 ),
                 feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
                 project_id=project_id,
+                extra=res.extra,
             )
         return results
 
@@ -4315,6 +4316,7 @@ def create_feedback(
         project_id: Optional[ID_TYPE] = None,
         comparative_experiment_id: Optional[ID_TYPE] = None,
         feedback_group_id: Optional[ID_TYPE] = None,
+        extra: Optional[Dict] = None,
         **kwargs: Any,
     ) -> ls_schemas.Feedback:
         """Create a feedback in the LangSmith API.
@@ -4360,6 +4362,8 @@
         feedback_group_id : str or UUID
             When logging preferences, ranking runs, or other comparative feedback,
             this is used to group feedback together.
+        extra : dict
+            Metadata for the feedback.
         """
         if run_id is None and project_id is None:
             raise ValueError("One of run_id and project_id must be provided")
@@ -4419,6 +4423,7 @@
                 comparative_experiment_id, accept_null=True
             ),
             feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True),
+            extra=extra,
         )
         feedback_block = _dumps_json(feedback.dict(exclude_none=True))
         self.request_with_retries(
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index a040ea7a3..378abb124 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -13,6 +13,8 @@
 import random
 import threading
 import uuid
+import inspect
+import re
 from contextvars import copy_context
 from typing import (
     Awaitable,
@@ -82,6 +84,27 @@
     ],
 ]
 
+def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]:
+    # Find the return statement
+    return_match = re.search(r'return\s*({[^}]+})', python_code)
+    if not return_match:
+        return []
+
+    # Extract the dictionary from the return statement
+    dict_str = return_match.group(1)
+
+    # Find all keys in the dictionary
+    key_matches = re.findall(r'"([^"]+)":', dict_str)
+
+    # Filter out 'key' and 'score'
+    feedback_keys = [key for key in key_matches if key not in ['key', 'score']]
+
+    # If 'key' is present in the dictionary, add its value to the feedback_keys
+    key_value_match = re.search(r'"key"\s*:\s*"([^"]+)"', dict_str)
+    if key_value_match:
+        feedback_keys.append(key_value_match.group(1))
+
+    return feedback_keys
 
 def evaluate(
     target: TARGET_T,
@@ -1353,6 +1376,15 @@ def _run_evaluators(
                         )
                     )
                 except Exception as e:
+                    feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func))
+                    error_response = EvaluationResults(results=[EvaluationResult(key=key, source_run_id=run.id,
+                        comment=repr(e), extra={"error": True}) for key in feedback_keys])
+                    eval_results["results"].extend(
+                        # TODO: This is a hack
+                        self.client._log_evaluation_feedback(
+                            error_response, run=run, _executor=executor
+                        )
+                    )
                     logger.error(
                         f"Error running evaluator {repr(evaluator)} on"
                         f" run {run.id}: {repr(e)}",
diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index 7e3e748ba..065f5b16b 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -90,6 +90,8 @@ class EvaluationResult(BaseModel):
 
     If none provided, the evaluation feedback is applied to the
     root trace being."""
+    extra: Optional[Dict] = None
+    """Metadata for the evaluator run."""
 
     class Config:
         """Pydantic model configuration."""
diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
index d859418e7..1c5145472 100644
--- a/python/langsmith/schemas.py
+++ b/python/langsmith/schemas.py
@@ -460,6 +460,8 @@ class FeedbackBase(BaseModel):
     """For preference scoring, this group ID is shared across feedbacks for each
 
     run in the group that was being compared."""
+    extra: Optional[Dict] = None
+    """The metadata of the feedback."""
 
     class Config:
         """Configuration class for the schema."""
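A minimal usage sketch of the additions above, assuming this patch is applied so that extract_code_evaluator_feedback_keys is importable from langsmith.evaluation._runner and Client.create_feedback accepts the new extra kwarg; the exact_match evaluator and the placeholder run ID are made up for illustration.

import inspect

from langsmith.evaluation._runner import extract_code_evaluator_feedback_keys


def exact_match(run, example) -> dict:
    # A typical custom code evaluator: returns its feedback key plus a score.
    score = run.outputs["output"] == example.outputs["output"]
    return {"key": "exact_match", "score": int(score)}


# The regex helper recovers the feedback key(s) from the evaluator's source so
# that error feedback can still be logged under the right name when the
# evaluator raises at runtime.
print(extract_code_evaluator_feedback_keys(inspect.getsource(exact_match)))
# -> ['exact_match']

# The extra dict carried by EvaluationResult is forwarded to the feedback
# endpoint; it can also be passed to create_feedback directly (requires a
# LangSmith API key):
#
# from langsmith import Client
# Client().create_feedback(
#     run_id="<run-uuid>",  # placeholder
#     key="exact_match",
#     comment="ZeroDivisionError('division by zero')",
#     extra={"error": True},
# )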