
Commit

[Evaluation] Upload results to AI Studio, remote evaluation, custom evaluator (#249)

* removed double role assignment 

Removed double role assignment for ID '8ebe5a00-799e-43f5-93ac-243d3dce84a7' (Search Index Data Contributor).

* Fix regex pattern in ghutils.ts

* adding ignite lab manual.

* Running pre-build

pre-build

* adding changes to notebooks.

* running pre-build.

* updating notebooks for workshop.

* Update workshop-4-ci-cd.ipynb

Fixed typo in workshop notebook 4.

* remote evals and local evals working

* undo bicep changes

* updates

---------

Co-authored-by: Carsten Lemm <[email protected]>
Co-authored-by: Sebastian Förg <[email protected]>
Co-authored-by: Marlene <[email protected]>
Co-authored-by: Josh Oh <[email protected]>
5 people authored Nov 11, 2024
1 parent f5b2f4b commit 92f0cc1
Showing 3 changed files with 208 additions and 96 deletions.
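
For context, the new remote-evaluation path in src/api/evaluate/evaluate.py reads its configuration from environment variables before it can submit an evaluation run to AI Studio. A minimal sketch of invoking it, assuming the module is importable as evaluate.evaluate and that you have already run az login; the connection-string format and variable names are taken from the code below, everything else is illustrative:

import os

# Connection string copied from the AI Studio project page (format documented in evaluate.py).
os.environ["PROJECT_CONNECTION_STRING"] = "<HostName>;<AzureSubscriptionId>;<ResourceGroup>;<HubName>"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "<deployment-name>"
os.environ["AZURE_OPENAI_API_VERSION"] = "<api-version>"

from evaluate.evaluate import evaluate_remote  # assumed import path

# eval_data.jsonl is the file evaluate_orchestrator writes next to evaluate.py.
evaluate_remote(data_path="src/api/evaluate/eval_data.jsonl")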
170 changes: 101 additions & 69 deletions src/api/evaluate/evaluate.py
@@ -2,15 +2,13 @@
import os
import sys
import json
import concurrent.futures
from pathlib import Path
from datetime import datetime
from promptflow.core import AzureOpenAIModelConfiguration
from azure.ai.evaluation import evaluate
from evaluate.evaluators import ArticleEvaluator
from .evaluators import ArticleEvaluator
from orchestrator import create
from prompty.tracer import trace
from tracing import init_tracing
from azure.identity import DefaultAzureCredential
from azure.ai.project import AIProjectClient
from azure.ai.project.models import Evaluation, Dataset, EvaluatorConfiguration, ConnectionType

from dotenv import load_dotenv

@@ -20,43 +18,96 @@
# # Add the api directory to the sys.path
# sys.path.append(os.path.abspath('../src/api'))

def evaluate_aistudio(model_config, project_scope, data_path):
# create unique id for each run with date and time
run_prefix = datetime.now().strftime("%Y%m%d%H%M%S")
run_id = f"{run_prefix}_chat_evaluation_sdk"
print(run_id)

result = evaluate(
evaluation_name=run_id,
data=data_path,
def evaluate_remote(data_path):
# Create an Azure AI Client from a connection string, copied from your AI Studio project.
# At the moment, it should be in the format "<HostName>;<AzureSubscriptionId>;<ResourceGroup>;<HubName>"
# The user needs to log in to the Azure subscription via the Azure CLI and set the required environment variables

project_client = AIProjectClient.from_connection_string(
credential=DefaultAzureCredential(),
conn_str=os.getenv("PROJECT_CONNECTION_STRING"),
)

data_id = project_client.upload_file(data_path)

default_connection = project_client.connections.get_default(connection_type=ConnectionType.AZURE_OPEN_AI)

deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
api_version = os.environ["AZURE_OPENAI_API_VERSION"]

model_config = default_connection.to_evaluator_model_config(deployment_name=deployment_name, api_version=api_version)
# Create an evaluation
evaluation = Evaluation(
display_name="Remote Evaluation",
description="Evaluation of dataset",
data=Dataset(id=data_id),
evaluators={
"article": ArticleEvaluator(model_config, project_scope),
},
evaluator_config={
"defaults": {
"query": "${data.query}",
"response": "${data.response}",
"context": "${data.context}",
},
"relevance": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Relevance-Evaluator/versions/4",
init_params={
"model_config": model_config
},
),
"fluency": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Fluency-Evaluator/versions/4",
init_params={
"model_config": model_config
},
),
"coherence": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Coherence-Evaluator/versions/4",
init_params={
"model_config": model_config
},
),
"groundedness": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Groundedness-Evaluator/versions/4",
init_params={
"model_config": model_config
},
),
"violence": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3",
init_params={
"azure_ai_project": project_client.scope
},
),
"hateunfairness": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4",
init_params={
"azure_ai_project": project_client.scope
},
),
"selfharm": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3",
init_params={
"azure_ai_project": project_client.scope
},
),
"sexual": EvaluatorConfiguration(
id="azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3",
init_params={
"azure_ai_project": project_client.scope
},
),
},
)
return result

def evaluate_data(model_config, project_scope, data_path):
writer_evaluator = ArticleEvaluator(model_config,project_scope)
# Create evaluation
evaluation_response = project_client.evaluations.create(
evaluation=evaluation,
)
# Get evaluation
get_evaluation_response = project_client.evaluations.get(evaluation_response.id)

data = []
with open(data_path) as f:
for line in f:
data.append(json.loads(line))
print("----------------------------------------------------------------")
print("Created remote evaluation, evaluation ID: ", get_evaluation_response.id)
print("Evaluation status: ", get_evaluation_response.status)
print("AI Studio URI: ", get_evaluation_response.properties["AiStudioEvaluationUri"])
print("----------------------------------------------------------------")

results = []
for row in data:
result = writer_evaluator(query=row["query"], context=row["context"], response=row["response"])
print("Evaluation results: ", result)
results.append(result)

return results

def run_orchestrator(research_context, product_context, assignment_context):
query = {"research_context": research_context, "product_context": product_context, "assignment_context": assignment_context}
@@ -84,54 +135,37 @@ def run_orchestrator(research_context, product_context, assignment_context):
def evaluate_orchestrator(model_config, project_scope, data_path):
writer_evaluator = ArticleEvaluator(model_config, project_scope)

data = []
data = []
eval_data = []
with open(data_path) as f:
for line in f:
data.append(json.loads(line))

eval_data = []
eval_results = []

results = []
# futures = []
def evaluate_row(research_context, product_context, assignment_context):
result = { "research_context": research_context }
print("Running orchestrator...")
eval_data = run_orchestrator(research_context, product_context, assignment_context)
print('')
print("Evaluating results...")
eval_result = writer_evaluator(query=eval_data["query"], context=eval_data["context"], response=eval_data["response"])
result.update(eval_result)
print("Evaluation results: ", eval_result)
eval_results.append(result)

# cannot execute concurrently with streamed data because of rate errors
# with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
for row in data:
results.append(evaluate_row(row["research_context"], row["product_context"], row["assignment_context"]))
# futures.append(executor.submit(evaluate_row, row["research_context"], row["product_context"], row["assignment_context"]))
# for future in futures:
# results.append(future.result())
row = json.loads(line)
data.append(row)
eval_data.append(run_orchestrator(row["research_context"], row["product_context"], row["assignment_context"]))

# write out eval data to a file so we can re-run evaluation on it
with jsonlines.open(folder + '/eval_data.jsonl', 'w') as writer:
for row in eval_data:
writer.write(row)

eval_data_path = folder + '/eval_data.jsonl'

eval_results = writer_evaluator(data_path=eval_data_path)
import pandas as pd

print("Evaluation summary:\n")
results_df = pd.DataFrame.from_dict(eval_results)
results_df_gpt_evals = results_df[['gpt_relevance', 'gpt_fluency', 'gpt_coherence','gpt_groundedness']]
results_df_content_safety = results_df[['violence_score', 'self_harm_score', 'hate_unfairness_score','sexual_score']]
print("View in Azure AI Studio at: " + str(eval_results['studio_url']))
metrics = {key: [value] for key, value in eval_results['metrics'].items()}
results_df = pd.DataFrame.from_dict(metrics)
results_df_gpt_evals = results_df[['relevance.gpt_relevance', 'fluency.gpt_fluency', 'coherence.gpt_coherence','groundedness.gpt_groundedness']]
results_df_content_safety = results_df[['violence.violence_defect_rate', 'self-harm.self_harm_defect_rate', 'hate-unfairness.hate_unfairness_defect_rate','sexual.sexual_defect_rate']]

# mean_df = results_df.drop("research_context", axis=1).mean()
mean_df = results_df_gpt_evals.mean()
print("\nAverage scores:")
print(mean_df)

content_safety_mean_df = results_df_content_safety.mean()
print("\nContent safety average scores:")
print("\nContent safety average defect rate:")
print(content_safety_mean_df)

results_df.to_markdown(folder + '/eval_results.md')
@@ -165,10 +199,8 @@ def evaluate_row(research_context, product_context, assignment_context):
# print(os.environ["BING_SEARCH_ENDPOINT"])
# print("value: ", os.environ["BING_SEARCH_KEY"], len(os.environ["BING_SEARCH_KEY"]))


tracer = init_tracing(local_tracing=True)

eval_result = evaluate_orchestrator(model_config, project_scope, data_path=folder +"/eval_inputs.jsonl")
evaluate_remote(data_path=folder +"/eval_data.jsonl")

end=time.time()
print(f"Finished evaluate in {end - start}s")
133 changes: 106 additions & 27 deletions src/api/evaluate/evaluators.py
@@ -1,47 +1,126 @@
import os
import json
import logging

import prompty
from opentelemetry import trace
from opentelemetry.trace import set_span_in_context
from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator, ContentSafetyEvaluator
from azure.ai.evaluation import ViolenceEvaluator, HateUnfairnessEvaluator, SelfHarmEvaluator, SexualEvaluator
from azure.ai.evaluation import evaluate

from azure.identity import DefaultAzureCredential

logging.getLogger('promptflow.core._prompty_utils').setLevel(logging.CRITICAL)

class FriendlinessEvaluator:
def __init__(self) -> None:
pass

def __call__(self, response):
model_config = {
"azure_deployment": os.environ.get("AZURE_OPENAI_4_EVAL_DEPLOYMENT_NAME"),
"azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT")
}
result = prompty.execute(
"friendliness.prompty",
configuration=model_config,
inputs={
"response": response,
}
)
return {"score": result}

class ArticleEvaluator:
def __init__(self, model_config, project_scope):
self.evaluators = [
RelevanceEvaluator(model_config),
FluencyEvaluator(model_config),
CoherenceEvaluator(model_config),
GroundednessEvaluator(model_config),
ViolenceEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
HateUnfairnessEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
SelfHarmEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
SexualEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential())
]
self.evaluators = {
"relevance": RelevanceEvaluator(model_config),
"fluency": FluencyEvaluator(model_config),
"coherence": CoherenceEvaluator(model_config),
"groundedness": GroundednessEvaluator(model_config),
"violence": ViolenceEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
"hate-unfairness": HateUnfairnessEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
"self-harm": SelfHarmEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
"sexual": SexualEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()),
"friendliness": FriendlinessEvaluator(),
}
self.project_scope = project_scope

def __call__(self, *, query: str, context: str, response: str, **kwargs):
def __call__(self, *, data_path, **kwargs):
output = {}
for evaluator in self.evaluators:
result = evaluator(
query=query,
context=context,
response=response,
)
output.update(result)

if not isinstance(evaluator, ContentSafetyEvaluator):
print(f"{evaluator} evaluation done!")
else:
print(f"Content saftey evaluation in done!")

## NOTE: The following code expects the user to have Storage Blob Data Contributor permissions so that results can be uploaded to Azure AI Studio.
result = evaluate(
data=data_path,
evaluators=self.evaluators,
## NOTE: If you do not have Storage Blob Data Contributor permissions, comment out the line below.
azure_ai_project=self.project_scope,
evaluator_config={
"relevance": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"fluency": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"coherence": {
"column_mapping": {
"response": "${data.response}",
"query": "${data.query}",
},
},
"groundedness": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"violence": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"self-harm": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"hate-unfairness": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"sexual": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
"friendliness": {
"column_mapping": {
"response": "${data.response}",
"context": "${data.context}",
"query": "${data.query}",
},
},
},
)
output.update(result)
return output

def evaluate_article(data, trace_context):
tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("run_evaluators", context=trace_context) as span:
@@ -57,7 +136,7 @@ def evaluate_article(data, trace_context):
"project_name": os.environ["AZURE_AI_PROJECT_NAME"],
}
evaluator = ArticleEvaluator(configuration, project_scope)
results = evaluator(query=data['query'], context=data['context'], response=data['response'])
results = evaluator(data)
resultsJson = json.dumps(results)
span.set_attribute("output", resultsJson)

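The evaluate() call used by ArticleEvaluator can also host additional custom evaluators alongside FriendlinessEvaluator. A minimal sketch of plugging one in; the AnswerLengthEvaluator class and the file name are illustrative and not part of this commit, while the evaluators/evaluator_config/column_mapping pattern mirrors the code above:

from azure.ai.evaluation import evaluate

class AnswerLengthEvaluator:
    """Toy custom evaluator: scores each row by the length of its response."""
    def __call__(self, *, response: str, **kwargs):
        return {"answer_length": len(response)}

result = evaluate(
    data="eval_data.jsonl",  # JSONL with query/context/response columns
    evaluators={"answer_length": AnswerLengthEvaluator()},
    evaluator_config={
        "answer_length": {
            "column_mapping": {"response": "${data.response}"},
        },
    },
)
print(result["metrics"])
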
1 change: 1 addition & 0 deletions src/api/requirements.txt
@@ -15,6 +15,7 @@ gunicorn==21.2.0
jupyter
opentelemetry-instrumentation
azure-identity==1.17.1
https://remoteevalbugbash.blob.core.windows.net/remoteevalbugbash/azure_ai_project-1.0.0b1-py3-none-any.whl
gunicorn==21.2.0
azure-keyvault-secrets
aiohttp==3.9.5
