From be454e570402c8d47168acb0b0268c6aae614f82 Mon Sep 17 00:00:00 2001
From: Nicole White
Date: Fri, 23 Feb 2024 11:33:37 -0500
Subject: [PATCH] Update evaluators (#106)

---
 .../my_project/evaluators/has_substrings.py            |  6 +++---
 .../testing-sdk/my_project/evaluators/is_valid_json.py |  6 +++---
 .../test_suites/flashcard_generator/evaluators.py      | 10 +++++-----
 .../test_suites/study_guide_outline/evaluators.py      | 10 +++++-----
 Python/testing-sdk/poetry.lock                         |  8 ++++----
 Python/testing-sdk/pyproject.toml                      |  2 +-
 6 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/Python/testing-sdk/my_project/evaluators/has_substrings.py b/Python/testing-sdk/my_project/evaluators/has_substrings.py
index 6738e554..ef658d32 100644
--- a/Python/testing-sdk/my_project/evaluators/has_substrings.py
+++ b/Python/testing-sdk/my_project/evaluators/has_substrings.py
@@ -3,13 +3,13 @@
 from typing import List
 from typing import Optional
 
-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import BaseTestCase
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold
 
 
-class BaseHasSubstrings(BaseEvaluator, abc.ABC):
+class BaseHasSubstrings(BaseTestEvaluator, abc.ABC):
     id = "has-substrings"
 
     """
@@ -51,7 +51,7 @@ def output_as_str(self, output: Any) -> str:
         """
         return str(output)
 
-    def evaluate(self, test_case: BaseTestCase, output: Any) -> Evaluation:
+    def evaluate_test_case(self, test_case: BaseTestCase, output: Any) -> Evaluation:
         expected_substrings = self.expected_substrings(test_case)
         output_as_str = self.output_as_str(output)
 
diff --git a/Python/testing-sdk/my_project/evaluators/is_valid_json.py b/Python/testing-sdk/my_project/evaluators/is_valid_json.py
index 2bf82a31..1b9bee08 100644
--- a/Python/testing-sdk/my_project/evaluators/is_valid_json.py
+++ b/Python/testing-sdk/my_project/evaluators/is_valid_json.py
@@ -2,13 +2,13 @@
 from typing import Any
 from typing import Optional
 
-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import BaseTestCase
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold
 
 
-class IsValidJson(BaseEvaluator):
+class IsValidJson(BaseTestEvaluator):
     id = "is-valid-json"
 
     """
@@ -39,7 +39,7 @@ def output_as_str(self, output: SomeCustomOutputType) -> str:
         """
         return str(output)
 
-    def evaluate(self, test_case: BaseTestCase, output: Any) -> Evaluation:
+    def evaluate_test_case(self, test_case: BaseTestCase, output: Any) -> Evaluation:
         try:
             json.loads(self.output_as_str(output))
             return Evaluation(score=1, threshold=self.threshold)
diff --git a/Python/testing-sdk/my_project/test_suites/flashcard_generator/evaluators.py b/Python/testing-sdk/my_project/test_suites/flashcard_generator/evaluators.py
index 257a8b9e..c45655bf 100644
--- a/Python/testing-sdk/my_project/test_suites/flashcard_generator/evaluators.py
+++ b/Python/testing-sdk/my_project/test_suites/flashcard_generator/evaluators.py
@@ -1,7 +1,7 @@
 import asyncio
 from typing import List
 
-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import Evaluation
 from openai import AsyncOpenAI
 
@@ -11,7 +11,7 @@
 openai_client = AsyncOpenAI()
 
 
-class IsProfessionalTone(BaseEvaluator):
+class IsProfessionalTone(BaseTestEvaluator):
     id = "is-professional-tone"
 
     prompt = """Please evaluate the provided text for its professionalism in the context of formal communication.
@@ -55,7 +55,7 @@ async def score_flashcard(self, flashcard: Flashcard) -> int:
 
         raise ValueError(f"Unexpected response: {raw_content}")
 
-    async def evaluate(
+    async def evaluate_test_case(
         self, test_case: TestCase, output: List[Flashcard]
     ) -> Evaluation:
         # Score each flashcard asynchronously
@@ -69,7 +69,7 @@ async def evaluate(
         return Evaluation(score=sum(scores) / len(scores))
 
 
-class IsSupportedByNotes(BaseEvaluator):
+class IsSupportedByNotes(BaseTestEvaluator):
     id = "is-supported-by-notes"
 
     prompt = """Given some notes by a student and a flashcard in the form of a question and answer, evaluate whether the flashcard's question and answer are supported by the notes.
@@ -117,7 +117,7 @@ async def score_flashcard(self, test_case: TestCase, flashcard: Flashcard) -> in
 
         raise ValueError(f"Unexpected response: {raw_content}")
 
-    async def evaluate(
+    async def evaluate_test_case(
         self, test_case: TestCase, output: List[Flashcard]
     ) -> Evaluation:
         """
diff --git a/Python/testing-sdk/my_project/test_suites/study_guide_outline/evaluators.py b/Python/testing-sdk/my_project/test_suites/study_guide_outline/evaluators.py
index 16111f3c..81ab58ab 100644
--- a/Python/testing-sdk/my_project/test_suites/study_guide_outline/evaluators.py
+++ b/Python/testing-sdk/my_project/test_suites/study_guide_outline/evaluators.py
@@ -1,6 +1,6 @@
 from typing import List
 
-from autoblocks.testing.models import BaseEvaluator
+from autoblocks.testing.models import BaseTestEvaluator
 from autoblocks.testing.models import Evaluation
 from autoblocks.testing.models import Threshold
 
@@ -8,7 +8,7 @@
 from my_project.test_suites.study_guide_outline.test_cases import TestCase
 
 
-class Formatting(BaseEvaluator):
+class Formatting(BaseTestEvaluator):
     id = "formatting"
 
     @staticmethod
@@ -21,11 +21,11 @@ def score(output: str) -> int:
             return 0
         return 1
 
-    def evaluate(self, test_case: TestCase, output: str) -> Evaluation:
+    def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
         return Evaluation(score=self.score(output), threshold=Threshold(gte=1))
 
 
-class NumCategories(BaseEvaluator):
+class NumCategories(BaseTestEvaluator):
     id = "num-categories"
 
     min_categories: int = 5
@@ -34,7 +34,7 @@ def score(self, output: str) -> int:
         return int(self.min_categories <= output.count("* ") <= self.max_categories)
 
 
-    def evaluate(self, test_case: TestCase, output: str) -> Evaluation:
+    def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
         return Evaluation(score=self.score(output), threshold=Threshold(gte=1))
 
 
diff --git a/Python/testing-sdk/poetry.lock b/Python/testing-sdk/poetry.lock
index 94032874..4e8d2a74 100644
--- a/Python/testing-sdk/poetry.lock
+++ b/Python/testing-sdk/poetry.lock
@@ -33,13 +33,13 @@ trio = ["trio (>=0.23)"]
 
 [[package]]
 name = "autoblocksai"
-version = "0.0.27"
+version = "0.0.28"
 description = "Python client for Autoblocks"
 optional = false
 python-versions = ">=3.8.1,<4.0.0"
 files = [
-    {file = "autoblocksai-0.0.27-py3-none-any.whl", hash = "sha256:6fb8976d957503d9ff757c7f224f4fd27ef14b0a69e5456afaa4a513bd5523db"},
-    {file = "autoblocksai-0.0.27.tar.gz", hash = "sha256:d31fe964e5a5105d10913a30664a2b356073a7483addf1e23df3902683ddb78e"},
+    {file = "autoblocksai-0.0.28-py3-none-any.whl", hash = "sha256:93c4f91e4bdb3ca2bde1ef7b4b49f42bae9d60f32c679de8d1465fd9aa35ce20"},
+    {file = "autoblocksai-0.0.28.tar.gz", hash = "sha256:e711213b6aba28fdd7ed3846fa3dd91a3f6f880fc276ffacb427ec963ff4a2ac"},
 ]
 
 [package.dependencies]
@@ -448,4 +448,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "4925a91a5164a9e0abfb898b49ef93fdd270d5868d8002f5255df93dcceab0f4"
+content-hash = "d3ba99cd223e5f63e2a4071381e3077e60b7f95cbb7f9fa1ddcb9624094112c4"
diff --git a/Python/testing-sdk/pyproject.toml b/Python/testing-sdk/pyproject.toml
index 4aa409de..827f1cad 100644
--- a/Python/testing-sdk/pyproject.toml
+++ b/Python/testing-sdk/pyproject.toml
@@ -10,7 +10,7 @@ packages = [{include = "my_project"}]
 
 [tool.poetry.dependencies]
 python = "^3.11"
-autoblocksai = ">=0.0.27"
+autoblocksai = ">=0.0.28"
 openai = "^1.0.0"
 
 [tool.poetry.scripts]