From f658fa79289052bcda8eb4b868376394b7b64528 Mon Sep 17 00:00:00 2001 From: Cassie Breviu <46505951+cassiebreviu@users.noreply.github.com> Date: Fri, 24 May 2024 10:23:03 -0700 Subject: [PATCH] Add gpt4 for evaluation in infra, fix article params and prompty, add evaluation logic (#80) * fix logging when running local * Evaluate uses gpt4 * add gpt4 eval model to terraform * check for related searches * Changes from CI/CD branch (#78) * troubleshoot bing endpoint/key * fix typo * fix typo * run evaluate on push * add some print debugging * more logging * increase debug logging level * use gpt-4 for evals * update max tokens * put max tokens back to 512 * increase token limit * updates * improve article length and token size * add request to eval result * revert token change * undo eval change * update max tokens * Improve evaluate print out * update evaluate * update evaluate * update evaluate * decrease article length, increase tokens * revert token changes * increase writer token limit, decrease word count --------- Co-authored-by: Dan Taylor <qubitron@users.noreply.github.com> --- .github/workflows/evaluate.yml | 13 +- azure.yaml | 1 + data/create-azure-search.py | 209 ++++++++++++++++++++ infra/manifests/api/config.tmpl.yaml | 1 + infra/manifests/api/deployment.tmpl.yaml | 2 + infra/openai.tf | 16 ++ infra/outputs.tf | 9 + infra/variables.tf | 26 +++ src/api/api/agents/orchestrator.py | 13 +- src/api/api/agents/researcher/researcher.py | 11 +- src/api/api/agents/writer/writer.prompty | 7 +- src/api/api/agents/writer/writer.py | 2 +- src/api/api/evaluate/evaluate.py | 25 +-- src/api/api/evaluate/evaluators.py | 6 +- src/api/api/logging.py | 4 +- 15 files changed, 307 insertions(+), 38 deletions(-) create mode 100644 data/create-azure-search.py diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index af71516f..fa96a600 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -2,11 +2,11 @@ name: Evaluate on: workflow_dispatch: - workflow_run: - workflows: ["azd provision and deploy"] - branches: [main] - types: - - completed + push: + # Run when commits are pushed to mainline branch (main or master) + # Set this to the mainline branch you are using + branches: + - main # Set up permissions for deploying with secretless Azure federated credentials # https://learn.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication @@ -24,6 +24,7 @@ jobs: AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }} AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }} + AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT }} AZURE_OPENAI_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_DEPLOYMENT_NAME }} AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME }} AZURE_SEARCH_ENDPOINT: ${{ vars.AZURE_SEARCH_ENDPOINT }} @@ -61,7 +62,7 @@ jobs: inlineScript: | az account set --subscription ${{env.AZURE_SUBSCRIPTION_ID}} - - name: evaluate chat data + - name: evaluate orchestrator working-directory: ./src/api run: | python -m api.evaluate.evaluate diff --git a/azure.yaml b/azure.yaml index 889243c1..d35eb8e3 100644 --- a/azure.yaml +++ b/azure.yaml @@ -60,6 +60,7 @@ infra: provider: terraform pipeline: variables: + - AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT - AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME - AZURE_OPENAI_35_TURBO_MODEL_NAME - AZURE_OPENAI_API_VERSION diff --git 
a/data/create-azure-search.py b/data/create-azure-search.py new file mode 100644 index 00000000..520dbd7e --- /dev/null +++ b/data/create-azure-search.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Generating your product search index +# This notebook is designed to automatically create the product search index for you. It uses the [product catalog](products.csv) file to create the index. In order to do so it needs names and keys for the following services: +# +# - Azure Search Service +# - Azure OpenAI Service +# +# You can find the names and keys in the Azure Portal. These need to be entered in a `.env` file in the root of this repository. The `.env` file is not checked in to source control. You can use the [`.env.sample`](../../.env.sample) file as a template. + +# In[1]: + + +import os +import pandas as pd +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + HnswParameters, + HnswAlgorithmConfiguration, + SemanticPrioritizedFields, + SearchableField, + SearchField, + SearchFieldDataType, + SearchIndex, + SemanticSearch, + SemanticConfiguration, + SemanticField, + SimpleField, + VectorSearch, + VectorSearchAlgorithmKind, + VectorSearchAlgorithmMetric, + ExhaustiveKnnAlgorithmConfiguration, + ExhaustiveKnnParameters, + VectorSearchProfile, +) +from typing import Any, Dict, List +from openai import AzureOpenAI +from dotenv import load_dotenv + +from pathlib import Path + +load_dotenv() + + +# In[2]: + + +def delete_index(search_index_client: SearchIndexClient, search_index: str): + print(f"deleting index {search_index}") + search_index_client.delete_index(search_index) + + +# In[3]: + + +def create_index_definition(name: str) -> SearchIndex: + """ + Returns an Azure AI Search index with the given name. + """ + # The fields we want to index. The "contentVector" field is a vector field that will + # be used for vector search. + fields = [ + SimpleField(name="id", type=SearchFieldDataType.String, key=True), + SearchableField(name="content", type=SearchFieldDataType.String), + SimpleField(name="filepath", type=SearchFieldDataType.String), + SearchableField(name="title", type=SearchFieldDataType.String), + SimpleField(name="url", type=SearchFieldDataType.String), + SearchField( + name="contentVector", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + # Size of the vector created by the text-embedding-ada-002 model. + vector_search_dimensions=1536, + vector_search_profile_name="myHnswProfile", + ), + ] + + # The "content" field should be prioritized for semantic ranking. + semantic_config = SemanticConfiguration( + name="default", + prioritized_fields=SemanticPrioritizedFields( + title_field=SemanticField(field_name="title"), + keywords_fields=[], + content_fields=[SemanticField(field_name="content")], + ), + ) + + # For vector search, we want to use the HNSW (Hierarchical Navigable Small World) + # algorithm (a type of approximate nearest neighbor search algorithm) with cosine + # distance. 
+ vector_search = VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration( + name="myHnsw", + kind=VectorSearchAlgorithmKind.HNSW, + parameters=HnswParameters( + m=4, + ef_construction=400, + ef_search=500, + metric=VectorSearchAlgorithmMetric.COSINE, + ), + ), + ExhaustiveKnnAlgorithmConfiguration( + name="myExhaustiveKnn", + kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN, + parameters=ExhaustiveKnnParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + ), + ], + profiles=[ + VectorSearchProfile( + name="myHnswProfile", + algorithm_configuration_name="myHnsw", + ), + VectorSearchProfile( + name="myExhaustiveKnnProfile", + algorithm_configuration_name="myExhaustiveKnn", + ), + ], + ) + + # Create the semantic settings with the configuration + semantic_search = SemanticSearch(configurations=[semantic_config]) + + # Create the search index. + index = SearchIndex( + name=name, + fields=fields, + semantic_search=semantic_search, + vector_search=vector_search, + ) + + return index + + +# In[4]: + + +def gen_products( + path: str, +) -> List[Dict[str, Any]]: + openai_service_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] + openai_deployment = "text-embedding-ada-002" + # openai.Embedding.create() -> client.embeddings.create() + azure_credential = DefaultAzureCredential() + token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") + client = AzureOpenAI( + api_version="2023-07-01-preview", + azure_endpoint=openai_service_endpoint, + azure_deployment=openai_deployment, + azure_ad_token_provider=token_provider + ) + + products = pd.read_csv(path) + items = [] + for product in products.to_dict("records"): + content = product["description"] + id = str(product["id"]) + title = product["name"] + url = f"/products/{title.lower().replace(' ', '-')}" + emb = client.embeddings.create(input=content, model=openai_deployment) + rec = { + "id": id, + "content": content, + "filepath": f"{title.lower().replace(' ', '-')}", + "title": title, + "url": url, + "contentVector": emb.data[0].embedding, + } + items.append(rec) + + return items + + +# In[5]: + + +aisearch_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"] +index_name = "contoso-products" + +search_index_client = SearchIndexClient( + aisearch_endpoint, DefaultAzureCredential() +) + +delete_index(search_index_client, index_name) +index = create_index_definition(index_name) +print(f"creating index {index_name}") +search_index_client.create_or_update_index(index) +print(f"index {index_name} created") + + +# In[6]: + + +print("indexing documents") +docs = gen_products("products.csv") +# Upload our data to the index. 
+search_client = SearchClient( + endpoint=aisearch_endpoint, + index_name=index_name, + credential=DefaultAzureCredential(), +) +print(f"uploading {len(docs)} documents to index {index_name}") +ds = search_client.upload_documents(docs) + diff --git a/infra/manifests/api/config.tmpl.yaml b/infra/manifests/api/config.tmpl.yaml index 4e704a29..a92c13b1 100644 --- a/infra/manifests/api/config.tmpl.yaml +++ b/infra/manifests/api/config.tmpl.yaml @@ -4,6 +4,7 @@ metadata: name: api-config data: AZURE_OPENAI_ENDPOINT: {{.Env.AZURE_OPENAI_ENDPOINT}} + AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT: {{.Env.AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT}} AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME: {{.Env.AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME}} AZURE_OPENAI_35_TURBO_MODEL_NAME: {{.Env.AZURE_OPENAI_35_TURBO_MODEL_NAME}} AZURE_OPENAI_API_VERSION: {{.Env.AZURE_OPENAI_API_VERSION}} diff --git a/infra/manifests/api/deployment.tmpl.yaml b/infra/manifests/api/deployment.tmpl.yaml index 1d1d1542..2de11761 100644 --- a/infra/manifests/api/deployment.tmpl.yaml +++ b/infra/manifests/api/deployment.tmpl.yaml @@ -34,6 +34,8 @@ spec: value: {{.Env.AZURE_OPENAI_DEPLOYMENT_NAME}} - name: AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME value: {{.Env.AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME}} + - name: AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT + value: {{.Env.AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT}} - name: AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME value: {{.Env.AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME}} - name: AZURE_OPENAI_ENDPOINT diff --git a/infra/openai.tf b/infra/openai.tf index f6603f96..3e20618d 100644 --- a/infra/openai.tf +++ b/infra/openai.tf @@ -47,6 +47,22 @@ resource "azurerm_cognitive_deployment" "gpt35_deployment" { } } +resource "azurerm_cognitive_deployment" "gpt4_deployment" { + name = var.openai_4_eval_deployment_name + cognitive_account_id = azurerm_cognitive_account.cog.id + + model { + format = "OpenAI" + name = var.openai_4_eval_model_name + version = var.openai_4_eval_model_version + } + + scale { + type = "Standard" + capacity = var.openai_4_eval_model_capacity + } +} + resource "azurerm_cognitive_deployment" "embedding_deployment" { name = var.openai_embedding_model_name cognitive_account_id = azurerm_cognitive_account.cog.id diff --git a/infra/outputs.tf b/infra/outputs.tf index dec3db9a..74e29fed 100644 --- a/infra/outputs.tf +++ b/infra/outputs.tf @@ -22,6 +22,15 @@ output "AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME" { value = var.openai_35_turbo_model_name } +output "AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT" { + value = var.openai_4_eval_deployment_name +} + +output "AZURE_OPENAI_4_EVAL_MODEL_VERSION" { + value = var.openai_4_eval_model_version +} + + output "AZURE_OPENAI_35_TURBO_MODEL_NAME" { value = var.openai_35_turbo_model_name } diff --git a/infra/variables.tf b/infra/variables.tf index da350a13..3f9e04df 100644 --- a/infra/variables.tf +++ b/infra/variables.tf @@ -43,6 +43,32 @@ variable "openai_35_turbo_model_name" { default = "gpt-35-turbo" } +variable "openai_4_eval_deployment_name" { + description = "value of azure openai deployment name" + type = string + default = "gpt-4-eval" +} + +variable "openai_4_eval_model_name" { + description = "value of azure openai model name" + type = string + default = "gpt-4" +} + + +variable "openai_4_eval_model_version" { + description = "value of azure openai model version" + type = string + default = "0613" +} + +variable "openai_4_eval_model_capacity" { + description = "value of azure openai model capacity" + type = number + default = 20 +} + + variable "openai_35_turbo_model_version" { description = "value of azure 
openai model version" type = string diff --git a/src/api/api/agents/orchestrator.py b/src/api/api/agents/orchestrator.py index 25814190..825cd5de 100644 --- a/src/api/api/agents/orchestrator.py +++ b/src/api/api/agents/orchestrator.py @@ -142,11 +142,16 @@ def write_article(request, instructions, evaluate=False): # Log final editor response log_output("Final editor response: %s", json.dumps(editor_response, indent=2)) -if __name__ == "__main__": - from api.logging import init_logging - - init_logging() +@trace +def test_write_article(): context = "Can you find the latest camping trends and what folks are doing in the winter?" instructions = "Can you find the relevant information needed and good places to visit" for result in write_article(context, instructions, evaluate=True): print(*result) + +if __name__ == "__main__": + from api.logging import init_logging + + init_logging() + test_write_article() + diff --git a/src/api/api/agents/researcher/researcher.py b/src/api/api/agents/researcher/researcher.py index 24d72613..8b75fae9 100644 --- a/src/api/api/agents/researcher/researcher.py +++ b/src/api/api/agents/researcher/researcher.py @@ -7,17 +7,17 @@ from promptflow.tracing import trace from promptflow.core import Prompty, AzureOpenAIModelConfiguration -from api.logging import log_output - from dotenv import load_dotenv from pathlib import Path +import base64 + folder = Path(__file__).parent.absolute().as_posix() load_dotenv() #bing does not currently support managed identity -BING_SEARCH_ENDPOINT = os.getenv("BING_SEARCH_ENDPOINT") -BING_SEARCH_KEY = os.getenv("BING_SEARCH_KEY") +BING_SEARCH_ENDPOINT = os.environ["BING_SEARCH_ENDPOINT"] +BING_SEARCH_KEY = os.environ["BING_SEARCH_KEY"] BING_HEADERS = {"Ocp-Apim-Subscription-Key": BING_SEARCH_KEY} @@ -42,11 +42,10 @@ def find_information(query, market="en-US"): {"url": a["url"], "name": a["name"], "description": a["snippet"]} for a in items["webPages"]["value"] ] - # check if relatedsearches exists if "relatedSearches" not in items: return {"pages": pages, "related": []} - + # otherwise, include the related searches related = [a["text"] for a in items["relatedSearches"]["value"]] return {"pages": pages, "related": related} diff --git a/src/api/api/agents/writer/writer.prompty b/src/api/api/agents/writer/writer.prompty index 7babca14..85202e64 100644 --- a/src/api/api/agents/writer/writer.prompty +++ b/src/api/api/agents/writer/writer.prompty @@ -38,7 +38,7 @@ system: You are an expert copywriter who can take research from a web researcher as well as some product information from marketing to produce a fun and engaging article that can be used as a magazine article or a blog post. The goal is to engage the reader and provide them with a fun and informative -article. The article should be between 800 and 1000 words. Use the following questions as the basis +article. The article should be between 300 and 500 words. Use the following questions as the basis of your article: # Research @@ -121,14 +121,13 @@ content: {% endfor %} # Article -Write a fun and engaging article that includes the research and product information. The article should -be between 800 and 1000 words. The goal is to engage the reader and provide them with a fun and informative +Write a fun and engaging article that includes the research and product information. The goal is to engage the reader and provide them with a fun and informative article. # Final Instructions Try to keep your writing short and to the point. 
The goal is to engage the reader and provide them with -a fun and informative article. The article should be between 800 and 1200 words. +a fun and informative article. user: Can you write a fun and engaging article that resolves the following: diff --git a/src/api/api/agents/writer/writer.py b/src/api/api/agents/writer/writer.py index 15d6a864..9c33ff09 100644 --- a/src/api/api/agents/writer/writer.py +++ b/src/api/api/agents/writer/writer.py @@ -19,7 +19,7 @@ def execute(request, feedback, instructions, research, products): override_model = { "configuration": configuration, - "parameters": {"max_tokens": 512} + "parameters": {"max_tokens": 1200} } # create path to prompty file prompty_file = folder + "/writer.prompty" diff --git a/src/api/api/evaluate/evaluate.py b/src/api/api/evaluate/evaluate.py index 2ea2183f..a88d8ef7 100644 --- a/src/api/api/evaluate/evaluate.py +++ b/src/api/api/evaluate/evaluate.py @@ -85,12 +85,14 @@ def evaluate_orchestrator(model_config, data_path): results = [] futures = [] def evaluate_row(request, instructions): + result = { "request": request } print("Running orchestrator...") - eval_data = run_orchestrator(row['request'], row['instructions']) + eval_data = run_orchestrator(request, instructions) print("Evaluating results...") eval_result = writer_evaluator(query=eval_data["query"], context=eval_data["context"], response=eval_data["response"]) + result.update(eval_result) print("Evaluation results: ", eval_result) - eval_results.append(eval_result) + eval_results.append(result) with concurrent.futures.ThreadPoolExecutor() as executor: for row in data: @@ -106,16 +108,17 @@ def evaluate_row(request, instructions): import pandas as pd print("Evaluation summary:\n") - df = pd.DataFrame.from_dict(eval_results) - print(df) + results_df = pd.DataFrame.from_dict(eval_results) + print(results_df) + mean_df = results_df.drop("request", axis=1).mean() print("\nAverage scores:") - print(df.mean()) + print(mean_df) - df.to_markdown(folder + '/eval_results.md') + results_df.to_markdown(folder + '/eval_results.md') with open(folder + '/eval_results.md', 'a') as file: file.write("\n\nAverage scores:\n\n") - df.mean().to_markdown(folder + '/eval_results.md', 'a') + mean_df.to_markdown(folder + '/eval_results.md', 'a') with jsonlines.open(folder + '/eval_results.jsonl', 'w') as writer: writer.write(eval_results) @@ -125,20 +128,18 @@ def evaluate_row(request, instructions): if __name__ == "__main__": import time import jsonlines - from api.logging import init_logging - - init_logging() # Initialize Azure OpenAI Connection model_config = AzureOpenAIModelConfiguration( - azure_deployment=os.environ["AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME"], + azure_deployment=os.environ["AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT"], api_version=os.environ["AZURE_OPENAI_API_VERSION"], azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"] ) start=time.time() print("Starting evaluate...") - + print("BING_SEARCH_ENDPOINT: ", os.environ["BING_SEARCH_ENDPOINT"]) + print("BING_SEARCH_KEY length: ", len(os.environ["BING_SEARCH_KEY"])) eval_result = evaluate_orchestrator(model_config, data_path=folder +"/eval_inputs.jsonl") end=time.time() diff --git a/src/api/api/evaluate/evaluators.py b/src/api/api/evaluate/evaluators.py index feba039a..bb68be08 100644 --- a/src/api/api/evaluate/evaluators.py +++ b/src/api/api/evaluate/evaluators.py @@ -34,9 +34,9 @@ def evaluate_article(data, trace_context): with tracer.start_as_current_span("run_evaluators", context=trace_context) as span: span.set_attribute("inputs", json.dumps(data)) 
configuration = AzureOpenAIModelConfiguration( - azure_deployment=os.getenv("AZURE_OPENAI_35_TURBO_DEPLOYMENT_NAME"), - api_version=os.getenv("AZURE_OPENAI_API_VERSION"), - azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT") + azure_deployment=os.environ["AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT"], + api_version=os.environ["AZURE_OPENAI_API_VERSION"], + azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"] ) evaluator = ArticleEvaluator(configuration) results = evaluator(query=data['query'], context=data['context'], response=data['response']) diff --git a/src/api/api/logging.py b/src/api/api/logging.py index 15c7004b..11234e24 100644 --- a/src/api/api/logging.py +++ b/src/api/api/logging.py @@ -15,7 +15,7 @@ def log_output(*args): logging.log(DEFAULT_LOG_LEVEL, *args) -def init_logging(sampling_rate=1.0): +def init_logging(sampling_rate=1.0, log_level=DEFAULT_LOG_LEVEL): # Enable logging to app insights if a connection string is provided if 'APPLICATIONINSIGHTS_CONNECTION_STRING' in os.environ: connection_string=os.environ['APPLICATIONINSIGHTS_CONNECTION_STRING'] @@ -29,6 +29,6 @@ def init_logging(sampling_rate=1.0): start_pf_tracing() logging.basicConfig( - level=DEFAULT_LOG_LEVEL, format="%(asctime)s - %(levelname)s - %(message)s" + level=log_level, format="%(asctime)s - %(levelname)s - %(message)s" ) log_output("Logging initialized.") \ No newline at end of file
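
For a quick end-to-end check of the contoso-products index that data/create-azure-search.py builds, a minimal query sketch like the following could be used. It assumes the same environment variables as the script, the azure-search-documents 11.4+ models module, an embedding deployment named after the model (as in the script), and an illustrative query string:

    import os

    from azure.identity import DefaultAzureCredential, get_bearer_token_provider
    from azure.search.documents import SearchClient
    from azure.search.documents.models import VectorizedQuery
    from openai import AzureOpenAI

    credential = DefaultAzureCredential()

    # Embed the query text with the same model used to index the catalog.
    openai_client = AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        azure_ad_token_provider=get_bearer_token_provider(
            credential, "https://cognitiveservices.azure.com/.default"
        ),
    )
    query = "waterproof tent for winter camping"  # illustrative query
    embedding = openai_client.embeddings.create(
        input=query, model="text-embedding-ada-002"
    ).data[0].embedding

    # Hybrid search: keyword match plus k-NN over the contentVector field.
    search_client = SearchClient(
        os.environ["AZURE_SEARCH_ENDPOINT"], "contoso-products", credential
    )
    results = search_client.search(
        search_text=query,
        vector_queries=[
            VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="contentVector")
        ],
        select=["title", "url"],
    )
    for doc in results:
        print(doc["title"], doc["url"])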
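
Because evaluate.py and evaluators.py now read their model settings with os.environ, a missing AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT pipeline variable surfaces as a bare KeyError. A small pre-flight helper along these lines could fail with a clearer message; this is a sketch, and build_eval_model_config is a hypothetical name, not part of the patch:

    import os

    from promptflow.core import AzureOpenAIModelConfiguration

    REQUIRED_VARS = [
        "AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_ENDPOINT",
    ]

    def build_eval_model_config() -> AzureOpenAIModelConfiguration:
        # Report every missing variable at once instead of the first KeyError.
        missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
        if missing:
            raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
        return AzureOpenAIModelConfiguration(
            azure_deployment=os.environ["AZURE_OPENAI_GPT4_EVAL_DEPLOYMENT"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        )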
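
The new log_level parameter on init_logging defaults to DEFAULT_LOG_LEVEL, so existing callers keep their behavior; a caller that wants the more verbose output used while debugging the Bing wiring could opt in explicitly. A short sketch against the api.logging module:

    import logging

    from api.logging import init_logging

    # Opt in to debug-level logs; callers that pass nothing keep DEFAULT_LOG_LEVEL.
    init_logging(sampling_rate=1.0, log_level=logging.DEBUG)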