-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from vivek-athina/dev
Adding a simple eval and validation script along with github action script
- Loading branch information
Showing
2 changed files
with
86 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
name: CI with Athina Evals

on:
  push:
    branches:
      - main  # Trigger CI on pushes to main branch only

jobs:
  evaluate:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install athina # Install Athina and other project dependencies as needed

      - name: Prepare Dataset
        run: |
          # If your dataset is not static, prepare it here
          # For example, download the dataset or build it from available data
          echo "Prepare your dataset here if necessary"

      - name: Run Athina Evaluation and Validation Script
        # run_athina_evals.py reads these keys via os.getenv(); without
        # forwarding the repository secrets the script cannot authenticate.
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ATHINA_API_KEY: ${{ secrets.ATHINA_API_KEY }}
        run: python run_athina_evals.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import os | ||
import pandas as pd | ||
from athina.evals import ( | ||
DoesResponseAnswerQuery | ||
) | ||
from athina.loaders import RagLoader | ||
from athina.keys import AthinaApiKey, OpenAiApiKey | ||
# Configure API credentials from the environment (the CI workflow is expected
# to export these variables; set_key receives None if they are absent).
OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

# Module-level dataset populated by load_data(); None until loaded.
dataset = None
|
||
def load_data():
    """Build the in-memory RAG dataset and store it in the module global ``dataset``.

    The dataset is a static batch of query/context/response records loaded
    through ``RagLoader``. The loaded dataset is also returned for
    convenience; callers relying solely on the global side effect are
    unaffected.

    Returns:
        The dataset produced by ``RagLoader().load_dict``.
    """
    # Create batch dataset from list of dict objects
    raw_data = [
        {
            "query": "What is the capital of Greece?",
            "context": "Greece is often called the cradle of Western civilization.",
            "response": "Athens",
        },
        {
            "query": "What is the price of a Tesla Model 3?",
            "context": "Tesla Model 3 is a fully electric car.",
            "response": "I cannot answer this question as prices vary from country to country.",
        },
        {
            "query": "What is a shooting star?",
            "context": "Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.",
            "response": "A shooting star is a meteor that burns up in the atmosphere.",
        },
    ]
    global dataset
    dataset = RagLoader().load_dict(raw_data)
    # NOTE: the original also built ``pd.DataFrame(dataset)`` and discarded
    # the result — dead code, removed.
    return dataset
|
||
def evaluate_and_validate(eval_model="gpt-3.5-turbo"):
    """Run the DoesResponseAnswerQuery eval over the loaded dataset and gate on it.

    Args:
        eval_model: LLM used by the evaluator. The default preserves the
            original hard-coded behavior; callers may now override it.

    Raises:
        ValueError: if no dataset has been loaded, or if any response fails
            the evaluation (failed rows are printed before raising).
    """
    if dataset is None:
        raise ValueError("No dataset loaded.")
    df = DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset).to_df()

    # Validation: Check if all rows in the dataframe passed the evaluation
    all_passed = df['passed'].all()
    if not all_passed:
        failed_responses = df[~df['passed']]
        print("Failed Responses:")
        print(failed_responses)
        raise ValueError("Not all responses passed the evaluation.")
    else:
        print("All responses passed the evaluation.")
|
||
def _main():
    """Script entry point: build the dataset, then run the evaluation gate."""
    load_data()
    evaluate_and_validate()


if __name__ == "__main__":
    _main()