Merge pull request #261 from marlenezw/evals_gh
adding evals for gh
marlenezw authored Nov 13, 2024
2 parents 0d3b413 + 1787806 commit 9302f02
Showing 2 changed files with 82 additions and 45 deletions.
53 changes: 38 additions & 15 deletions src/api/evaluate/evaluate.py
@@ -17,6 +17,8 @@
 load_dotenv()
 folder = Path(__file__).parent.absolute().as_posix()
 
+runningonGH = os.getenv("GITHUB_ACTIONS")
+
 # # Add the api directory to the sys.path
 # sys.path.append(os.path.abspath('../src/api'))
 
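For context, GitHub Actions sets the GITHUB_ACTIONS environment variable to the string "true" on its runners, so os.getenv("GITHUB_ACTIONS") returns a truthy string in CI and None on a local machine. A minimal sketch of how the new runningonGH flag behaves (standalone illustration, not part of this diff):

import os

# GitHub Actions sets GITHUB_ACTIONS="true" on its runners; locally the
# variable is usually unset, so os.getenv returns None.
runningonGH = os.getenv("GITHUB_ACTIONS")

if runningonGH:
    print("Running inside a GitHub Actions workflow")
else:
    print("Running locally")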
@@ -268,26 +270,47 @@ def evaluate_image(project_scope, image_path):
     import pandas as pd
 
     print("Image Evaluation summary:\n")
-    print("View in Azure AI Studio at: " + str(eval_results['studio_url']))
-    metrics = {key: [value] for key, value in eval_results['metrics'].items()}
-
-    results_df = pd.DataFrame.from_dict(metrics)
-
-    result_keys = [*metrics.keys()]
-    results_df_gpt_evals = results_df[result_keys]
-
-    mean_df = results_df_gpt_evals.mean()
-    print("\nAverage scores:")
-    print(mean_df)
-
-    results_df.to_markdown(folder + '/image_eval_results.md')
-    with open(folder + '/image_eval_results.md', 'a') as file:
-        file.write("\n\nAverages scores:\n\n")
-        mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
-
-    with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
-        writer.write(eval_results)
+    if runningonGH:
+
+        metrics = {key: [value] for key, value in eval_results.items()}
+
+        results_df = pd.DataFrame.from_dict(metrics)
+
+        results_df_gpt_evals = results_df.loc[:, results_df.columns.str.contains('score')]
+
+        mean_df = results_df_gpt_evals.mean()
+        print("\nAverage scores:")
+        print(mean_df)
+
+        results_df.to_markdown(folder + '/image_eval_results.md')
+        with open(folder + '/image_eval_results.md', 'a') as file:
+            file.write("\n\nAverages scores:\n\n")
+            mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
+
+        with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
+            writer.write(eval_results)
+    else:
+        print("View in Azure AI Studio at: " + str(eval_results['studio_url']))
+        metrics = {key: [value] for key, value in eval_results['metrics'].items()}
+
+        results_df = pd.DataFrame.from_dict(metrics)
+
+        result_keys = [*metrics.keys()]
+        results_df_gpt_evals = results_df[result_keys]
+
+        mean_df = results_df_gpt_evals.mean()
+        print("\nAverage scores:")
+        print(mean_df)
+
+        results_df.to_markdown(folder + '/image_eval_results.md')
+        with open(folder + '/image_eval_results.md', 'a') as file:
+            file.write("\n\nAverages scores:\n\n")
+            mean_df.to_markdown(folder + '/image_eval_results.md', 'a')
+
+        with jsonlines.open(folder + '/image_eval_results.jsonl', 'w') as writer:
+            writer.write(eval_results)
 
     return eval_results

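In the GitHub Actions branch above, eval_results is a flat dict of per-evaluator outputs, so only the columns whose names contain "score" are averaged; in the local branch it is the output of evaluate(), which exposes aggregate numbers under 'metrics' and a 'studio_url' link. A small sketch of the score-column selection, using hypothetical key names rather than the SDK's actual ones:

import pandas as pd

# Hypothetical flat results, shaped like the per-evaluator output used on CI
# (these key names are illustrative only).
eval_results = {
    "violence_score": 0,
    "self_harm_score": 1,
    "protected_material_label": False,
}

metrics = {key: [value] for key, value in eval_results.items()}
results_df = pd.DataFrame.from_dict(metrics)

# Keep only the numeric *score* columns before averaging.
results_df_gpt_evals = results_df.loc[:, results_df.columns.str.contains('score')]
print(results_df_gpt_evals.mean())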
74 changes: 44 additions & 30 deletions src/api/evaluate/evaluators.py
@@ -14,6 +14,10 @@

 from azure.identity import DefaultAzureCredential
 
+load_dotenv()
+
+runningonGH = os.getenv("GITHUB_ACTIONS")
+
 logging.basicConfig(level=logging.CRITICAL)
 
 # logging.getLogger('promptflow.core._prompty_utils').setLevel(logging.CRITICAL)
@@ -130,26 +134,26 @@ def __call__(self, *, data_path, **kwargs):
 class ImageEvaluator:
     def __init__(self, project_scope):
         self.evaluators = {
-            # "content_safety": ContentSafetyMultimodalEvaluator(
-            #     credential=DefaultAzureCredential(),
-            #     azure_ai_project=project_scope,
-            # ),
-            "violence":ViolenceMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "self_harm":SelfHarmMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "hate_unfairness":HateUnfairnessMultimodalEvaluator(
-                credential=DefaultAzureCredential(),
-                azure_ai_project=project_scope,
-            ),
-            "sexual":SexualMultimodalEvaluator(
+            "content_safety": ContentSafetyMultimodalEvaluator(
                 credential=DefaultAzureCredential(),
                 azure_ai_project=project_scope,
             ),
+            # "violence":ViolenceMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "self_harm":SelfHarmMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "hate_unfairness":HateUnfairnessMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
+            # "sexual":SexualMultimodalEvaluator(
+            #     credential=DefaultAzureCredential(),
+            #     azure_ai_project=project_scope,
+            # ),
             "protected_material": ProtectedMaterialMultimodalEvaluator(
                 credential=DefaultAzureCredential(),
                 azure_ai_project=project_scope,
@@ -181,21 +185,31 @@ def __call__(self, *, messages, **kwargs):
         input_data = pd.read_json(file_path, lines=True)
         pprint(input_data)
 
-
         print("\n===== Calling Evaluate API - Content Safety & Protected Material Evaluator for multi-modal =======")
         output = {}
-        result = evaluate(
-            evaluation_name=f"evaluate-api-multi-modal-eval-dataset-{str(uuid.uuid4())}",
-            data=file_path,
-            evaluators=self.evaluators,
-            azure_ai_project=self.project_scope,
-            evaluator_config={
-                "content_safety": {"conversation": "${data.conversation}"},
-                "protected_material": {"conversation": "${data.conversation}"}
-            }
-        )
-
-        output.update(result)
+        if runningonGH:
+            for message in messages:
+                conversation = {"conversation": { "messages" : message}}
+
+                content_safety_evaluator = ContentSafetyMultimodalEvaluator(credential=DefaultAzureCredential(),azure_ai_project=self.project_scope)
+                protected_material_evaluator = ProtectedMaterialMultimodalEvaluator(credential=DefaultAzureCredential(),azure_ai_project=self.project_scope)
+                result_1 = content_safety_evaluator(conversation=conversation["conversation"])
+                output.update(result_1)
+                result_2 = protected_material_evaluator(conversation=conversation["conversation"])
+                output.update(result_2)
+        else:
+            result = evaluate(
+                evaluation_name=f"evaluate-api-multi-modal-eval-dataset-{str(uuid.uuid4())}",
+                data=file_path,
+                evaluators=self.evaluators,
+                azure_ai_project=self.project_scope,
+                evaluator_config={
+                    "content_safety": {"conversation": "${data.conversation}"},
+                    "protected_material": {"conversation": "${data.conversation}"}
+                }
+            )
+
+            output.update(result)
 
         return output
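A hedged usage sketch of the updated ImageEvaluator follows; the import path, project-scope values, and message format are assumptions for illustration, not taken from this commit. Each element of messages is one conversation (a list of role/content turns), matching the per-message loop in the GitHub Actions branch:

from evaluators import ImageEvaluator  # hypothetical import path for this module

from azure.identity import DefaultAzureCredential  # evaluators authenticate with this

# Placeholder Azure AI Studio project scope; substitute real values.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<ai-studio-project>",
}

# A minimal, hypothetical conversation: one user turn with text plus an image,
# one assistant turn with text (message shape assumed, not from this diff).
messages = [
    [
        {"role": "user", "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ]},
        {"role": "assistant", "content": [
            {"type": "text", "text": "A cat sitting on a windowsill."},
        ]},
    ]
]

evaluator = ImageEvaluator(project_scope)
results = evaluator(messages=messages)
print(results)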

@@ -258,4 +272,4 @@ def evaluate_image(messages):
print("results: ", resultsJson)



