From 17878066dcb2a5ed27c0bc2d9d74ca3471e3f9eb Mon Sep 17 00:00:00 2001
From: Marlene <57748216+marlenezw@users.noreply.github.com>
Date: Wed, 13 Nov 2024 21:41:16 +0000
Subject: [PATCH] adding evals for gh.

---
 src/api/evaluate/evaluate.py   | 53 +++++++++++++++++-------
 src/api/evaluate/evaluators.py | 74 ++++++++++++++++++++--------------
 2 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/src/api/evaluate/evaluate.py b/src/api/evaluate/evaluate.py
index 3c2bbf02..809de054 100644
--- a/src/api/evaluate/evaluate.py
+++ b/src/api/evaluate/evaluate.py
@@ -17,6 +17,8 @@ load_dotenv()
 
 folder = Path(__file__).parent.absolute().as_posix()
 
+runningonGH = os.getenv("GITHUB_ACTIONS")
+
 # # Add the api directory to the sys.path
 # sys.path.append(os.path.abspath('../src/api'))
 
@@ -268,26 +270,47 @@ def evaluate_image(project_scope, image_path):
     import pandas as pd
 
     print("Image Evaluation summary:\n")
-    print("View in Azure AI Studio at: " + str(eval_results['studio_url']))
-    metrics = {key: [value] for key, value in eval_results['metrics'].items()}
-
-    results_df = pd.DataFrame.from_dict(metrics)
-    result_keys = [*metrics.keys()]
+    if runningonGH:
+
+        metrics = {key: [value] for key, value in eval_results.items()}
+
+        results_df = pd.DataFrame.from_dict(metrics)
+
+        results_df_gpt_evals = results_df.loc[:, results_df.columns.str.contains('score')]
+
+        mean_df = results_df_gpt_evals.mean()
+        print("\nAverage scores:")
+        print(mean_df)
+
+        results_df.to_markdown(folder + '/image_eval_results.md')
+        with open(folder + '/image_eval_results.md', 'a') as file:
+            file.write("\n\nAverages scores:\n\n")
+        mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
+
+        with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
+            writer.write(eval_results)
+    else:
+        print("View in Azure AI Studio at: " + str(eval_results['studio_url']))
+        metrics = {key: [value] for key, value in eval_results['metrics'].items()}
 
-    results_df_gpt_evals = results_df[result_keys]
+        results_df = pd.DataFrame.from_dict(metrics)
 
-    mean_df = results_df_gpt_evals.mean()
-    print("\nAverage scores:")
-    print(mean_df)
+        result_keys = [*metrics.keys()]
+
+        results_df_gpt_evals = results_df[result_keys]
 
-    results_df.to_markdown(folder + '/image_eval_results.md')
-    with open(folder + '/image_eval_results.md', 'a') as file:
-        file.write("\n\nAverages scores:\n\n")
-    mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
+        mean_df = results_df_gpt_evals.mean()
+        print("\nAverage scores:")
+        print(mean_df)
 
-    with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
-        writer.write(eval_results)
+        results_df.to_markdown(folder + '/image_eval_results.md')
+        with open(folder + '/image_eval_results.md', 'a') as file:
+            file.write("\n\nAverages scores:\n\n")
+        mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
+
+        with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
+            writer.write(eval_results)
 
     return eval_results
 
diff --git a/src/api/evaluate/evaluators.py b/src/api/evaluate/evaluators.py
index 7d51e249..553048e8 100644
--- a/src/api/evaluate/evaluators.py
+++ b/src/api/evaluate/evaluators.py
@@ -14,6 +14,10 @@
 
 from azure.identity import DefaultAzureCredential
 
+load_dotenv()
+
+runningonGH = os.getenv("GITHUB_ACTIONS")
+
 logging.basicConfig(level=logging.CRITICAL)
 # logging.getLogger('promptflow.core._prompty_utils').setLevel(logging.CRITICAL)
 
@@ -130,26 +134,26 @@ def __call__(self, *, data_path, **kwargs):
 class ImageEvaluator:
     def __init__(self, project_scope):
         self.evaluators = {
-            # "content_safety": ContentSafetyMultimodalEvaluator(
-            #     credential=DefaultAzureCredential(),
-            #     azure_ai_project=project_scope,
-            # ),
-            "violence":ViolenceMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "self_harm":SelfHarmMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "hate_unfairness":HateUnfairnessMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "sexual":SexualMultimodalEvaluator(
+            "content_safety": ContentSafetyMultimodalEvaluator(
                 credential=DefaultAzureCredential(),
                 azure_ai_project=project_scope,
             ),
+            # "violence":ViolenceMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "self_harm":SelfHarmMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "hate_unfairness":HateUnfairnessMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "sexual":SexualMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
             "protected_material": ProtectedMaterialMultimodalEvaluator(
                 credential=DefaultAzureCredential(),
                 azure_ai_project=project_scope,
@@ -181,21 +185,31 @@ def __call__(self, *, messages, **kwargs):
         input_data = pd.read_json(file_path, lines=True)
         pprint(input_data)
 
-        print("\n===== Calling Evaluate API - Content Safety & Protected Material Evaluator for multi-modal =======")
         output = {}
-        result = evaluate(
-            evaluation_name=f"evaluate-api-multi-modal-eval-dataset-{str(uuid.uuid4())}",
-            data=file_path,
-            evaluators=self.evaluators,
-            azure_ai_project=self.project_scope,
-            evaluator_config={
-                "content_safety": {"conversation": "${data.conversation}"},
-                "protected_material": {"conversation": "${data.conversation}"}
-            }
-        )
+        if runningonGH:
+            for message in messages:
+                conversation = {"conversation": { "messages" : message}}
 
-        output.update(result)
+                content_safety_evaluator = ContentSafetyMultimodalEvaluator(credential=DefaultAzureCredential(),azure_ai_project=self.project_scope)
+                protected_material_evaluator = ProtectedMaterialMultimodalEvaluator(credential=DefaultAzureCredential(),azure_ai_project=self.project_scope)
+                result_1 = content_safety_evaluator(conversation=conversation["conversation"])
+                output.update(result_1)
+                result_2 = protected_material_evaluator(conversation=conversation["conversation"])
+                output.update(result_2)
+        else:
+            result = evaluate(
+                evaluation_name=f"evaluate-api-multi-modal-eval-dataset-{str(uuid.uuid4())}",
+                data=file_path,
+                evaluators=self.evaluators,
+                azure_ai_project=self.project_scope,
+                evaluator_config={
+                    "content_safety": {"conversation": "${data.conversation}"},
+                    "protected_material": {"conversation": "${data.conversation}"}
+                }
+            )
+
+            output.update(result)
 
         return output
 
@@ -258,4 +272,4 @@ def evaluate_image(messages):
 
     print("results: ", resultsJson)
 
-
+
\ No newline at end of file
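
A minimal sketch (separate from the patch itself) of the GITHUB_ACTIONS gate both files rely on: GitHub-hosted runners export GITHUB_ACTIONS="true", so os.getenv() returns a truthy string in CI and None locally. The variable name and print messages below are illustrative only, not code from the repository.

    import os

    # "true" on a GitHub Actions runner; None (falsy) on a local machine.
    running_on_gh = os.getenv("GITHUB_ACTIONS")

    if running_on_gh:
        # CI path taken by the patch: call each multimodal evaluator directly,
        # one message at a time, and merge the returned score dictionaries.
        print("Running evaluators directly; no Azure AI Studio URL is produced.")
    else:
        # Local path: submit a batch run via evaluate() and report the studio_url.
        print("Submitting a batch evaluation run to the Azure AI project.")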