Merge pull request #4 from athina-ai/dev
Improvements
Showing 8 changed files with 170 additions and 57 deletions.
This file was deleted.
@@ -1,3 +1,29 @@
# CI using Athina Evals

This repository demonstrates how you can set up a CI configuration for your RAG application using Athina Evals.

## RAG Application

A sample RAG application is provided in the `src` directory. It is a simple RAG application built using llama_index. We are going to add evaluations for this application using Athina Evals.

## Setting up the dependencies

Install the necessary dependencies by running the following command:

```bash
pip install -r requirements.txt
```

Additionally, you need to install the `athina-evals` package by running the following command:

```bash
pip install athina-evals
```

## Running the evaluation script

Then you can run the evaluation script from the repository root (as a module, so the `src` imports resolve):

```bash
python -m evaluations.run_athina_evals
```
@@ -0,0 +1,12 @@
{"query": "When was Y Combinator founded?", "expected_response": "Y Combinator was founded in March 2005."}
{"query": "What is Y Combinator's primary mission?", "expected_response": "Y Combinator's primary mission is to support early-stage startups through funding, mentorship, and networking opportunities, helping them to launch successful businesses."}
{"query": "Can you name some successful companies that have graduated from Y Combinator?", "expected_response": "Dropbox, Airbnb, Stripe, Reddit, and Instacart are among the successful companies that have graduated from Y Combinator."}
{"query": "What industry sectors do Y Combinator's most successful alumni companies belong to?", "expected_response": "Y Combinator's most successful alumni companies belong to a variety of sectors, including technology, hospitality, finance, social media, and e-commerce."}
{"query": "How much funding does Y Combinator typically provide to its startups?", "expected_response": "Y Combinator typically provides around $125,000 in funding to its startups."}
{"query": "What equity stake does Y Combinator usually take in return for its investment?", "expected_response": "Y Combinator usually takes a 7% equity stake in return for its investment in startups."}
{"query": "What is the startup accelerator model introduced by Y Combinator?", "expected_response": "The startup accelerator model introduced by Y Combinator involves providing funding, mentorship, and networking opportunities to early-stage startups over a fixed-term period, culminating in a demo day where startups pitch to investors."}
{"query": "What are SAFE agreements, and how did Y Combinator standardize them?", "expected_response": "SAFE agreements (Simple Agreement for Future Equity) are legal documents created by Y Combinator to simplify early-stage investment deals, allowing startups to receive funding from investors in exchange for future equity without determining an immediate valuation."}
{"query": "How has Y Combinator's alumni network impacted new entrepreneurs?", "expected_response": "Y Combinator's vast network of alumni and mentors provides ongoing support, advice, and networking opportunities to new entrepreneurs, significantly impacting their success and growth."}
{"query": "What qualities does Y Combinator look for in the founding team of a startup?", "expected_response": "Y Combinator looks for qualities such as strength, cohesiveness, and the ability to execute rapidly in the founding team of a startup."}
{"query": "What factors about a startup's product or service are important to Y Combinator?", "expected_response": "The novelty, market potential, and the problem it solves are important factors about a startup's product or service to Y Combinator."}
{"query": "What does Y Combinator consider when evaluating the scalability of a business model?", "expected_response": "When evaluating scalability, Y Combinator considers whether the business model allows for rapid growth and expansion in the market."}
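The golden dataset above is newline-delimited JSON (JSONL), one record per line. A minimal sketch of loading it, assuming the `evaluations/golden_dataset.jsonl` path used in this repo:

```python
import json

def load_golden_dataset(path: str) -> list:
    """Read a JSONL file: one JSON object per line, blank lines skipped."""
    records = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
```

Each returned dict then carries the `query` and `expected_response` keys that the evaluation script consumes.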
@@ -0,0 +1,76 @@
import json
import os
from athina.evals import (
    DoesResponseAnswerQuery,
    Faithfulness
)
from athina.loaders import RagLoader
from athina.keys import AthinaApiKey, OpenAiApiKey
from src.rag_application import RagApplication

OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))
AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))

dataset = None

def load_data():
    app = RagApplication(openai_api_key=os.getenv('OPENAI_API_KEY'))
    # Create a batch dataset from a list of dict objects
    raw_data = [
        {
            "query": "How much equity does YC take?",
            "expected_response": "Y Combinator takes a 7% equity stake in companies in return for $125,000 on a post-money SAFE, and a 1.5% equity stake in companies participating in the YC Fellowship Program in exchange for a $20,000 investment.",
        }
    ]

    # Or read the golden dataset from file instead:
    # with open('evaluations/golden_dataset.jsonl', 'r') as file:
    #     raw_data = [json.loads(line) for line in file if line.strip()]

    # Generate a context and response for each query with the RAG application,
    # since the evaluators need both to score the rows
    for item in raw_data:
        item['context'], item['response'] = app.generate_response(item['query'])

    global dataset
    dataset = RagLoader().load_dict(raw_data)

def _validate(df):
    # Check whether all rows in the dataframe passed the evaluation
    if df['passed'].all():
        print("All responses passed the evaluation.")
    else:
        print("Failed Responses:")
        print(df[~df['passed']])
        raise ValueError("Not all responses passed the evaluation.")

def evaluate_and_validate():
    if dataset is None:
        raise ValueError("No dataset loaded.")

    eval_model = "gpt-3.5-turbo"

    # Validate whether the response answers the query
    _validate(DoesResponseAnswerQuery(model=eval_model).run_batch(data=dataset).to_df())

    # Validate whether the response is faithful to the context
    _validate(Faithfulness(model=eval_model).run_batch(data=dataset).to_df())

if __name__ == "__main__":
    load_data()
    evaluate_and_validate()
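The pass/fail gate the script applies to each evaluator's output (every row's `passed` flag must be true, otherwise print the failures and raise so CI fails) can be sketched without pandas or Athina as:

```python
def validate_results(rows):
    """Raise if any evaluation row failed; mirrors the script's check on df['passed']."""
    failed = [r for r in rows if not r["passed"]]
    if failed:
        print("Failed Responses:")
        for r in failed:
            print(r)
        raise ValueError("Not all responses passed the evaluation.")
    print("All responses passed the evaluation.")
```

Because the raised `ValueError` makes the process exit non-zero, a CI job running this script fails automatically whenever any evaluation fails.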
@@ -0,0 +1,4 @@
llama_index==0.9.40
langchain-openai==0.0.3
openai==1.12.0
python-dotenv==1.0.1
Empty file.
@@ -0,0 +1,48 @@
"""
This is a sample LLM application using RAG.
We will use llama_index to generate a response for a given query.
"""

import os
import openai
from typing import Tuple
from dotenv import load_dotenv
from llama_index import download_loader
from llama_index import VectorStoreIndex, ServiceContext

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

class RagApplication():

    def __init__(self, openai_api_key):
        # Use the key the caller passed in
        openai.api_key = openai_api_key
        self.query_engine = self.create_query_engine()

    def create_query_engine(self):
        # Create a llama_index query engine over the Y Combinator Wikipedia page
        WikipediaReader = download_loader("WikipediaReader")
        loader = WikipediaReader()
        documents = loader.load_data(pages=['Y Combinator'])
        vector_index = VectorStoreIndex.from_documents(
            documents, service_context=ServiceContext.from_defaults(chunk_size=512)
        )
        return vector_index.as_query_engine()

    def generate_response(self, query: str) -> Tuple[str, str]:
        """
        Generates a (context, response) pair for the given query.
        """
        contexts = []
        query_engine_response = self.query_engine.query(query)
        response = query_engine_response.response
        for c in query_engine_response.source_nodes:
            text = c.node.get_text()
            contexts.append(text)

        return "".join(contexts), response

if __name__ == "__main__":
    # Initialize the application
    app = RagApplication(openai_api_key=OPENAI_API_KEY)
    print(app.generate_response("How much equity does YC take?"))
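`generate_response` returns a `(context, response)` pair, while the evaluation loader expects dicts with `query`, `context`, `response`, and `expected_response` keys. A minimal sketch of bridging the two (the helper name `to_eval_row` is hypothetical, not part of this repo):

```python
def to_eval_row(query, expected_response, generate_response):
    """Build one loader-style record from any callable with the same
    contract as RagApplication.generate_response: query -> (context, response)."""
    context, response = generate_response(query)
    return {
        "query": query,
        "context": context,
        "response": response,
        "expected_response": expected_response,
    }
```

Passing the method itself (`app.generate_response`) keeps the dataset-building code testable with a stub in place of the real RAG application.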