From 92f0cc17ac3f4193997e3d7c8072dfa83bb17768 Mon Sep 17 00:00:00 2001 From: slister1001 <103153180+slister1001@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:14:30 -0800 Subject: [PATCH] [Evaluation] Upload results to AI Studio, remote evaluation, custom evaluator (#249) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * removed double role assignment Removed double role assignment for ID '8ebe5a00-799e-43f5-93ac-243d3dce84a7' (Search Index Data Contributor). * Fix regex pattern in ghutils.ts * adding ignite lab manual. * Running pre-build pre-build * adding changes to notebooks. * running pre-build. * updating notebooks for workshop. * Update workshop-4-ci-cd.ipynb Fixed typo in workshop notebook 4. * remote evals and local evals working * undo bicep changes * updates --------- Co-authored-by: Carsten Lemm Co-authored-by: Sebastian Förg <72460768+sebafo@users.noreply.github.com> Co-authored-by: Marlene <57748216+marlenezw@users.noreply.github.com> Co-authored-by: Josh Oh --- src/api/evaluate/evaluate.py | 170 ++++++++++++++++++++------------- src/api/evaluate/evaluators.py | 133 ++++++++++++++++++++------ src/api/requirements.txt | 1 + 3 files changed, 208 insertions(+), 96 deletions(-) diff --git a/src/api/evaluate/evaluate.py b/src/api/evaluate/evaluate.py index a18a9954..6385914c 100644 --- a/src/api/evaluate/evaluate.py +++ b/src/api/evaluate/evaluate.py @@ -2,15 +2,13 @@ import os import sys import json -import concurrent.futures from pathlib import Path -from datetime import datetime -from promptflow.core import AzureOpenAIModelConfiguration -from azure.ai.evaluation import evaluate -from evaluate.evaluators import ArticleEvaluator +from .evaluators import ArticleEvaluator from orchestrator import create from prompty.tracer import trace -from tracing import init_tracing +from azure.identity import DefaultAzureCredential +from azure.ai.project import AIProjectClient +from azure.ai.project.models import Evaluation, Dataset, EvaluatorConfiguration, ConnectionType from dotenv import load_dotenv @@ -20,43 +18,96 @@ # # Add the api directory to the sys.path # sys.path.append(os.path.abspath('../src/api')) -def evaluate_aistudio(model_config, project_scope, data_path): - # create unique id for each run with date and time - run_prefix = datetime.now().strftime("%Y%m%d%H%M%S") - run_id = f"{run_prefix}_chat_evaluation_sdk" - print(run_id) - result = evaluate( - evaluation_name=run_id, - data=data_path, +def evaluate_remote(data_path): + # Create an Azure AI Client from a connection string, copied from your AI Studio project. 
+ # At the moment, it should be in the format ";;;" + # Customer needs to login to Azure subscription via Azure CLI and set the environment variables + + project_client = AIProjectClient.from_connection_string( + credential=DefaultAzureCredential(), + conn_str=os.getenv("PROJECT_CONNECTION_STRING"), + ) + + data_id = project_client.upload_file(data_path) + + default_connection = project_client.connections.get_default(connection_type=ConnectionType.AZURE_OPEN_AI) + + deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] + api_version = os.environ["AZURE_OPENAI_API_VERSION"] + + model_config = default_connection.to_evaluator_model_config(deployment_name=deployment_name, api_version=api_version) + # Create an evaluation + evaluation = Evaluation( + display_name="Remote Evaluation", + description="Evaluation of dataset", + data=Dataset(id=data_id), evaluators={ - "article": ArticleEvaluator(model_config, project_scope), - }, - evaluator_config={ - "defaults": { - "query": "${data.query}", - "response": "${data.response}", - "context": "${data.context}", - }, + "relevance": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Relevance-Evaluator/versions/4", + init_params={ + "model_config": model_config + }, + ), + "fluency": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Fluency-Evaluator/versions/4", + init_params={ + "model_config": model_config + }, + ), + "coherence": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Coherence-Evaluator/versions/4", + init_params={ + "model_config": model_config + }, + ), + "groundedness": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Groundedness-Evaluator/versions/4", + init_params={ + "model_config": model_config + }, + ), + "violence": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3", + init_params={ + "azure_ai_project": project_client.scope + }, + ), + "hateunfairness": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4", + init_params={ + "azure_ai_project": project_client.scope + }, + ), + "selfharm": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3", + init_params={ + "azure_ai_project": project_client.scope + }, + ), + "sexual": EvaluatorConfiguration( + id="azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3", + init_params={ + "azure_ai_project": project_client.scope + }, + ), }, ) - return result -def evaluate_data(model_config, project_scope, data_path): - writer_evaluator = ArticleEvaluator(model_config,project_scope) + # Create evaluation + evaluation_response = project_client.evaluations.create( + evaluation=evaluation, + ) + # Get evaluation + get_evaluation_response = project_client.evaluations.get(evaluation_response.id) - data = [] - with open(data_path) as f: - for line in f: - data.append(json.loads(line)) + print("----------------------------------------------------------------") + print("Created remote evaluation, evaluation ID: ", get_evaluation_response.id) + print("Evaluation status: ", get_evaluation_response.status) + print("AI Studio URI: ", get_evaluation_response.properties["AiStudioEvaluationUri"]) + print("----------------------------------------------------------------") - results = [] - for row in data: - result = writer_evaluator(query=row["query"], context=row["context"], response=row["response"]) - print("Evaluation results: ", result) - 
results.append(result) - return results def run_orchestrator(research_context, product_context, assignment_context): query = {"research_context": research_context, "product_context": product_context, "assignment_context": assignment_context} @@ -84,54 +135,37 @@ def run_orchestrator(research_context, product_context, assignment_context): def evaluate_orchestrator(model_config, project_scope, data_path): writer_evaluator = ArticleEvaluator(model_config, project_scope) - data = [] + data = [] + eval_data = [] with open(data_path) as f: for line in f: - data.append(json.loads(line)) - - eval_data = [] - eval_results = [] - - results = [] - # futures = [] - def evaluate_row(research_context, product_context, assignment_context): - result = { "research_context": research_context } - print("Running orchestrator...") - eval_data = run_orchestrator(research_context, product_context, assignment_context) - print('') - print("Evaluating results...") - eval_result = writer_evaluator(query=eval_data["query"], context=eval_data["context"], response=eval_data["response"]) - result.update(eval_result) - print("Evaluation results: ", eval_result) - eval_results.append(result) - - #can not execute concurrently with streamed data because of rate errors - # with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - for row in data: - results.append(evaluate_row(row["research_context"], row["product_context"], row["assignment_context"])) - # futures.append(executor.submit(evaluate_row, row["research_context"], row["product_context"], row["assignment_context"])) - # for future in futures: - # results.append(future.result()) + row = json.loads(line) + data.append(row) + eval_data.append(run_orchestrator(row["research_context"], row["product_context"], row["assignment_context"])) # write out eval data to a file so we can re-run evaluation on it with jsonlines.open(folder + '/eval_data.jsonl', 'w') as writer: for row in eval_data: writer.write(row) + eval_data_path = folder + '/eval_data.jsonl' + + eval_results = writer_evaluator(data_path=eval_data_path) import pandas as pd print("Evaluation summary:\n") - results_df = pd.DataFrame.from_dict(eval_results) - results_df_gpt_evals = results_df[['gpt_relevance', 'gpt_fluency', 'gpt_coherence','gpt_groundedness']] - results_df_content_safety = results_df[['violence_score', 'self_harm_score', 'hate_unfairness_score','sexual_score']] + print("View in Azure AI Studio at: " + str(eval_results['studio_url'])) + metrics = {key: [value] for key, value in eval_results['metrics'].items()} + results_df = pd.DataFrame.from_dict(metrics) + results_df_gpt_evals = results_df[['relevance.gpt_relevance', 'fluency.gpt_fluency', 'coherence.gpt_coherence','groundedness.gpt_groundedness']] + results_df_content_safety = results_df[['violence.violence_defect_rate', 'self-harm.self_harm_defect_rate', 'hate-unfairness.hate_unfairness_defect_rate','sexual.sexual_defect_rate']] - # mean_df = results_df.drop("research_context", axis=1).mean() mean_df = results_df_gpt_evals.mean() print("\nAverage scores:") print(mean_df) content_safety_mean_df = results_df_content_safety.mean() - print("\nContent safety average scores:") + print("\nContent safety average defect rate:") print(content_safety_mean_df) results_df.to_markdown(folder + '/eval_results.md') @@ -165,10 +199,8 @@ def evaluate_row(research_context, product_context, assignment_context): # print(os.environ["BING_SEARCH_ENDPOINT"]) # print("value: ", os.environ["BING_SEARCH_KEY"], len(os.environ["BING_SEARCH_KEY"])) - - tracer 
= init_tracing(local_tracing=True) - eval_result = evaluate_orchestrator(model_config, project_scope, data_path=folder +"/eval_inputs.jsonl") + evaluate_remote(data_path=folder +"/eval_data.jsonl") end=time.time() print(f"Finished evaluate in {end - start}s") \ No newline at end of file diff --git a/src/api/evaluate/evaluators.py b/src/api/evaluate/evaluators.py index 74ba3688..d37a7f07 100644 --- a/src/api/evaluate/evaluators.py +++ b/src/api/evaluate/evaluators.py @@ -1,47 +1,126 @@ import os import json import logging - +import prompty from opentelemetry import trace from opentelemetry.trace import set_span_in_context from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator, ContentSafetyEvaluator from azure.ai.evaluation import ViolenceEvaluator, HateUnfairnessEvaluator, SelfHarmEvaluator, SexualEvaluator +from azure.ai.evaluation import evaluate from azure.identity import DefaultAzureCredential logging.getLogger('promptflow.core._prompty_utils').setLevel(logging.CRITICAL) +class FriendlinessEvaluator: + def __init__(self) -> None: + pass + + def __call__(self, response): + model_config = { + "azure_deployment": os.environ.get("AZURE_OPENAI_4_EVAL_DEPLOYMENT_NAME"), + "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT") + } + result = prompty.execute( + "friendliness.prompty", + configuration=model_config, + inputs={ + "response": response, + } + ) + return {"score": result} class ArticleEvaluator: def __init__(self, model_config, project_scope): - self.evaluators = [ - RelevanceEvaluator(model_config), - FluencyEvaluator(model_config), - CoherenceEvaluator(model_config), - GroundednessEvaluator(model_config), - ViolenceEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), - HateUnfairnessEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), - SelfHarmEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), - SexualEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()) - ] + self.evaluators = { + "relevance": RelevanceEvaluator(model_config), + "fluency": FluencyEvaluator(model_config), + "coherence": CoherenceEvaluator(model_config), + "groundedness": GroundednessEvaluator(model_config), + "violence": ViolenceEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), + "hate-unfairness": HateUnfairnessEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), + "self-harm": SelfHarmEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), + "sexual": SexualEvaluator(azure_ai_project=project_scope, credential=DefaultAzureCredential()), + "friendliness": FriendlinessEvaluator(), + } + self.project_scope = project_scope - def __call__(self, *, query: str, context: str, response: str, **kwargs): + def __call__(self, *, data_path, **kwargs): output = {} - for evaluator in self.evaluators: - result = evaluator( - query=query, - context=context, - response=response, - ) - output.update(result) - - if not isinstance(evaluator, ContentSafetyEvaluator): - print(f"{evaluator} evaluation done!") - else: - print(f"Content saftey evaluation in done!") - + ## NOTE: - The following code expects that the user has Storage Blob Data Contributor permissions in order for the results to upload to the Azure AI Studio. 
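+ ## evaluate() runs every evaluator registered in self.evaluators over the JSONL file at data_path;
+ ## because azure_ai_project is supplied, the run is also uploaded to the Azure AI Studio project, and the
+ ## returned dict includes aggregate 'metrics' and a 'studio_url' (both consumed in evaluate.py).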
+ result = evaluate( + data=data_path, + evaluators=self.evaluators, + ## NOTE: If you do not have Storage Blob Data Contributor permissions, please comment out the below line of code. + azure_ai_project=self.project_scope, + evaluator_config={ + "relevance": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "fluency": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "coherence": { + "column_mapping": { + "response": "${data.response}", + "query": "${data.query}", + }, + }, + "groundedness": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "violence": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "self-harm": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "hate-unfairness": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "sexual": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + "friendliness": { + "column_mapping": { + "response": "${data.response}", + "context": "${data.context}", + "query": "${data.query}", + }, + }, + }, + ) + output.update(result) return output - + def evaluate_article(data, trace_context): tracer = trace.get_tracer(__name__) with tracer.start_as_current_span("run_evaluators", context=trace_context) as span: @@ -57,7 +136,7 @@ def evaluate_article(data, trace_context): "project_name": os.environ["AZURE_AI_PROJECT_NAME"], } evaluator = ArticleEvaluator(configuration, project_scope) - results = evaluator(query=data['query'], context=data['context'], response=data['response']) + results = evaluator(data) resultsJson = json.dumps(results) span.set_attribute("output", resultsJson) diff --git a/src/api/requirements.txt b/src/api/requirements.txt index fc877142..dd99dfda 100644 --- a/src/api/requirements.txt +++ b/src/api/requirements.txt @@ -15,6 +15,7 @@ gunicorn==21.2.0 jupyter opentelemetry-instrumentation azure-identity==1.17.1 +https://remoteevalbugbash.blob.core.windows.net/remoteevalbugbash/azure_ai_project-1.0.0b1-py3-none-any.whl gunicorn==21.2.0 azure-keyvault-secrets aiohttp==3.9.5
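Usage note: the new FriendlinessEvaluator in src/api/evaluate/evaluators.py is a plain callable, so besides being registered inside ArticleEvaluator it can be exercised on its own or handed straight to azure.ai.evaluation's evaluate(). Below is a minimal sketch, assuming AZURE_OPENAI_4_EVAL_DEPLOYMENT_NAME and AZURE_OPENAI_ENDPOINT are set, friendliness.prompty is reachable from the working directory, an eval_data.jsonl produced by evaluate.py is available locally, and the import path matches how src/api is placed on sys.path:

from azure.ai.evaluation import evaluate
from evaluate.evaluators import FriendlinessEvaluator  # import path is an assumption; adjust to your sys.path

friendliness = FriendlinessEvaluator()

# Score a single response directly (FriendlinessEvaluator.__call__ wraps friendliness.prompty).
print(friendliness(response="Thanks for reaching out, happy to help!"))

# Or run it over a JSONL dataset using the same ${data.*} column-mapping syntax used by ArticleEvaluator.
result = evaluate(
    data="eval_data.jsonl",  # hypothetical local path; evaluate.py writes eval_data.jsonl into its data folder
    evaluators={"friendliness": friendliness},
    evaluator_config={
        "friendliness": {"column_mapping": {"response": "${data.response}"}},
    },
)
print(result["metrics"])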