track evaluator errors from sdk

langchain-ai · Oct 8, 2024 · 2cfa36e · 2cfa36e
1 parent f034c38
commit 2cfa36e
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 0 deletions.
diff --git a/python/langsmith/client.py b/python/langsmith/client.py
@@ -4245,6 +4245,7 @@ def _submit_feedback(**kwargs):
                 ),
                 feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
                 project_id=project_id,
+                extra=res.extra,
             )
         return results
 
@@ -4315,6 +4316,7 @@ def create_feedback(
         project_id: Optional[ID_TYPE] = None,
         comparative_experiment_id: Optional[ID_TYPE] = None,
         feedback_group_id: Optional[ID_TYPE] = None,
+        extra: Optional[Dict] = None,
         **kwargs: Any,
     ) -> ls_schemas.Feedback:
         """Create a feedback in the LangSmith API.
@@ -4360,6 +4362,8 @@ def create_feedback(
         feedback_group_id : str or UUID
             When logging preferences, ranking runs, or other comparative feedback,
             this is used to group feedback together.
+        extra : dict
+            Metadata for the feedback.
         """
         if run_id is None and project_id is None:
             raise ValueError("One of run_id and project_id must be provided")
@@ -4419,6 +4423,7 @@ def create_feedback(
                 comparative_experiment_id, accept_null=True
             ),
             feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True),
+            extra=extra,
         )
         feedback_block = _dumps_json(feedback.dict(exclude_none=True))
         self.request_with_retries(

diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
@@ -13,6 +13,8 @@
 import random
 import threading
 import uuid
+import inspect
+import re
 from contextvars import copy_context
 from typing import (
     Awaitable,
@@ -82,6 +84,27 @@
     ],
 ]
 
+def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]:
+    """Guess an evaluator's feedback keys from its source text.
+
+    Heuristic regex scan (not a parse) of the first ``return {...}`` dict
+    literal without nested braces. Returns every quoted key other than
+    ``key``/``score``, then the string value of ``key`` if present, or ``[]``
+    when no such return exists. Single or double quotes are both accepted.
+    """
+    return_match = re.search(r"return\s*({[^}]+})", python_code)
+    if not return_match:
+        return []
+    dict_str = return_match.group(1)
+    # One alternative per quote style: each findall tuple has one non-empty group.
+    key_matches = re.findall(r"""'([^']+)':|"([^"]+)":""", dict_str)
+    feedback_keys = [s or d for s, d in key_matches if (s or d) not in ("key", "score")]
+
+    key_value_match = re.search(
+        r"""(?:"key"|'key')\s*:\s*(?:"([^"]+)"|'([^']+)')""", dict_str)
+    if key_value_match:
+        feedback_keys.append(key_value_match.group(1) or key_value_match.group(2))
+    return feedback_keys
 
 def evaluate(
     target: TARGET_T,
@@ -1353,6 +1376,15 @@ def _run_evaluators(
                         )
                     )
                 except Exception as e:
+                    feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func))
+                    error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id,
+                                        comment=repr(e),extra={"error":True}) for key in feedback_keys])
+                    eval_results["results"].extend(
+                        # TODO: This is a hack
+                        self.client._log_evaluation_feedback(
+                            error_response, run=run, _executor=executor
+                        )
+                    )
                     logger.error(
                         f"Error running evaluator {repr(evaluator)} on"
                         f" run {run.id}: {repr(e)}",

diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
@@ -90,6 +90,8 @@ class EvaluationResult(BaseModel):
     
     If none provided, the evaluation feedback is applied to the
     root trace being."""
+    extra: Optional[Dict] = None
+    """Metadata for the evaluator run."""
 
     class Config:
         """Pydantic model configuration."""

diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py
@@ -460,6 +460,8 @@ class FeedbackBase(BaseModel):
     """For preference scoring, this group ID is shared across feedbacks for each
 
     run in the group that was being compared."""
+    extra: Optional[Dict] = None
+    """The metadata of the feedback."""
 
     class Config:
         """Configuration class for the schema."""