Update evaluators (#106)
Nicole White authored Feb 23, 2024
1 parent 931ee12 commit be454e5
Showing 6 changed files with 21 additions and 21 deletions.
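
In short, this commit migrates the example project to the renamed evaluator API in the Autoblocks testing SDK: BaseEvaluator becomes BaseTestEvaluator, evaluate becomes evaluate_test_case, and the dependency floor moves to autoblocksai >= 0.0.28. For orientation, here is a minimal sketch of an evaluator in the new shape, mirroring the pattern in the diffs below; the TestCase dataclass and its expected_output field are hypothetical stand-ins, not code from this repository.

from dataclasses import dataclass

from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.models import BaseTestEvaluator
from autoblocks.testing.models import Evaluation
from autoblocks.testing.models import Threshold


@dataclass
class TestCase(BaseTestCase):
    # Hypothetical test case for illustration only.
    input: str
    expected_output: str

    def hash(self) -> str:
        # Uniquely identifies this test case within its suite.
        return self.input


class ExactMatch(BaseTestEvaluator):
    id = "exact-match"

    def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
        # Score 1 when the output matches the expected value exactly;
        # Threshold(gte=1) marks anything less as a failure.
        score = int(output == test_case.expected_output)
        return Evaluation(score=score, threshold=Threshold(gte=1))
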
Python/testing-sdk/my_project/evaluators/has_substrings.py (3 additions, 3 deletions)
@@ -3,13 +3,13 @@
 from typing import List
 from typing import Optional

-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import BaseTestCase
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold


-class BaseHasSubstrings(BaseEvaluator, abc.ABC):
+class BaseHasSubstrings(BaseTestEvaluator, abc.ABC):
     id = "has-substrings"

     """
@@ -51,7 +51,7 @@ def output_as_str(self, output: Any) -> str:
         """
         return str(output)

-    def evaluate(self, test_case: BaseTestCase, output: Any) -> Evaluation:
+    def evaluate_test_case(self, test_case: BaseTestCase, output: Any) -> Evaluation:
         expected_substrings = self.expected_substrings(test_case)
         output_as_str = self.output_as_str(output)

Python/testing-sdk/my_project/evaluators/is_valid_json.py (3 additions, 3 deletions)
@@ -2,13 +2,13 @@
 from typing import Any
 from typing import Optional

-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import BaseTestCase
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold


-class IsValidJson(BaseEvaluator):
+class IsValidJson(BaseTestEvaluator):
     id = "is-valid-json"

     """
@@ -39,7 +39,7 @@ def output_as_str(self, output: SomeCustomOutputType) -> str:
         """
         return str(output)

-    def evaluate(self, test_case: BaseTestCase, output: Any) -> Evaluation:
+    def evaluate_test_case(self, test_case: BaseTestCase, output: Any) -> Evaluation:
         try:
             json.loads(self.output_as_str(output))
             return Evaluation(score=1, threshold=self.threshold)

@@ -1,7 +1,7 @@
 import asyncio
 from typing import List

-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import Evaluation
 from openai import AsyncOpenAI

@@ -11,7 +11,7 @@
 openai_client = AsyncOpenAI()


-class IsProfessionalTone(BaseEvaluator):
+class IsProfessionalTone(BaseTestEvaluator):
     id = "is-professional-tone"

     prompt = """Please evaluate the provided text for its professionalism in the context of formal communication.
@@ -55,7 +55,7 @@ async def score_flashcard(self, flashcard: Flashcard) -> int:

         raise ValueError(f"Unexpected response: {raw_content}")

-    async def evaluate(
+    async def evaluate_test_case(
         self, test_case: TestCase, output: List[Flashcard]
     ) -> Evaluation:
         # Score each flashcard asynchronously
@@ -69,7 +69,7 @@ async def evaluate(
         return Evaluation(score=sum(scores) / len(scores))


-class IsSupportedByNotes(BaseEvaluator):
+class IsSupportedByNotes(BaseTestEvaluator):
     id = "is-supported-by-notes"

     prompt = """Given some notes by a student and a flashcard in the form of a question and answer, evaluate whether the flashcard's question and answer are supported by the notes.
@@ -117,7 +117,7 @@ async def score_flashcard(self, test_case: TestCase, flashcard: Flashcard) -> int:

         raise ValueError(f"Unexpected response: {raw_content}")

-    async def evaluate(
+    async def evaluate_test_case(
         self, test_case: TestCase, output: List[Flashcard]
     ) -> Evaluation:
         """
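
Note that evaluate_test_case may be declared async, as the two LLM-judge evaluators above do: each flashcard is scored concurrently via asyncio, and the scores are averaged into a single Evaluation.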

@@ -1,14 +1,14 @@
 from typing import List

-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold

 from my_project.evaluators.has_substrings import BaseHasSubstrings
 from my_project.test_suites.study_guide_outline.test_cases import TestCase


-class Formatting(BaseEvaluator):
+class Formatting(BaseTestEvaluator):
     id = "formatting"

     @staticmethod
@@ -21,11 +21,11 @@ def score(output: str) -> int:
             return 0
         return 1

-    def evaluate(self, test_case: TestCase, output: str) -> Evaluation:
+    def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
         return Evaluation(score=self.score(output), threshold=Threshold(gte=1))


-class NumCategories(BaseEvaluator):
+class NumCategories(BaseTestEvaluator):
     id = "num-categories"

     min_categories: int = 5
@@ -34,7 +34,7 @@ class NumCategories(BaseEvaluator):
     def score(self, output: str) -> int:
         return int(self.min_categories <= output.count("* ") <= self.max_categories)

-    def evaluate(self, test_case: TestCase, output: str) -> Evaluation:
+    def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
         return Evaluation(score=self.score(output), threshold=Threshold(gte=1))

Python/testing-sdk/poetry.lock (4 additions, 4 deletions)

Some generated files are not rendered by default.

Python/testing-sdk/pyproject.toml (1 addition, 1 deletion)
@@ -10,7 +10,7 @@ packages = [{include = "my_project"}]

 [tool.poetry.dependencies]
 python = "^3.11"
-autoblocksai = ">=0.0.27"
+autoblocksai = ">=0.0.28"
 openai = "^1.0.0"

 [tool.poetry.scripts]
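
The floor moves from 0.0.27 to 0.0.28, presumably the first autoblocksai release that ships the renamed BaseTestEvaluator / evaluate_test_case interface used throughout this commit.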
