Skip to content

Commit

Permalink
Merge pull request #1 from vivek-athina/dev
Browse files Browse the repository at this point in the history
Add a simple evaluation-and-validation script along with a GitHub Actions workflow script
  • Loading branch information
vivek-athina authored Feb 21, 2024
2 parents 98de19e + 2a03c27 commit 824a3a1
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 0 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/athina_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: CI with Athina Evals

on:
  push:
    branches:
      - main # Trigger CI on pushes to main branch only

jobs:
  evaluate:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install athina # Install Athina and other project dependencies as needed

      - name: Prepare Dataset
        run: |
          # If your dataset is not static, prepare it here
          # For example, download the dataset or build it from available data
          echo "Prepare your dataset here if necessary"

      - name: Run Athina Evaluation and Validation Script
        # run_athina_evals.py reads OPENAI_API_KEY and ATHINA_API_KEY via
        # os.getenv(); without this env block both would be None in CI and
        # every eval call would fail authentication.
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ATHINA_API_KEY: ${{ secrets.ATHINA_API_KEY }}
        run: python run_athina_evals.py
54 changes: 54 additions & 0 deletions run_athina_evals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import pandas as pd
from athina.evals import (
DoesResponseAnswerQuery
)
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
# Configure Athina/OpenAI credentials from the environment (set as repository
# secrets in CI). os.getenv returns None when a variable is unset —
# NOTE(review): neither set_key call appears to validate this, so a missing
# secret would only surface later as an authentication failure.
OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

# Module-level dataset cache; populated by load_data() before evaluation runs.
dataset = None

def load_data():
    """Build the in-memory RAG example batch and cache it in the global ``dataset``.

    Constructs a small list of query/context/response dicts and loads them
    through Athina's ``RagLoader`` so ``evaluate_and_validate()`` can run
    evals against them. Returns nothing; the result is stored module-wide.
    """
    # Create batch dataset from list of dict objects
    raw_data = [
        {
            "query": "What is the capital of Greece?",
            "context": "Greece is often called the cradle of Western civilization.",
            "response": "Athens",
        },
        {
            "query": "What is the price of a Tesla Model 3?",
            "context": "Tesla Model 3 is a fully electric car.",
            "response": "I cannot answer this question as prices vary from country to country.",
        },
        {
            "query": "What is a shooting star?",
            "context": "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
            "response": "A shooting star is a meteor that burns up in the atmosphere.",
        },
    ]
    global dataset
    dataset = RagLoader().load_dict(raw_data)
    # NOTE: removed dead `pd.DataFrame(dataset)` — the frame was constructed
    # and immediately discarded, performing no useful work.

def evaluate_and_validate():
    """Run the DoesResponseAnswerQuery eval over the cached dataset.

    Raises:
        ValueError: if ``load_data()`` has not populated the dataset, or if
            any row fails the evaluation (failed rows are printed first).
    """
    # Guard clause: the module-level dataset must be loaded first.
    if dataset is None:
        raise ValueError("No dataset loaded.")

    eval_model = "gpt-3.5-turbo"
    results_df = DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset).to_df()

    # Validation: every row must have passed the evaluation.
    if results_df['passed'].all():
        print("All responses passed the evaluation.")
        return

    print("Failed Responses:")
    print(results_df[~results_df['passed']])
    raise ValueError("Not all responses passed the evaluation.")

if __name__ == "__main__":
    # Script entry point: build the dataset, then run and validate the evals.
    load_data()
    evaluate_and_validate()

0 comments on commit 824a3a1

Please sign in to comment.