-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
227 lines (199 loc) · 8.56 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
Teaching Assistant Chatbot
This module provides a user interface for a teaching assistant chatbot using Streamlit.
The chatbot is designed to assist students with answering questions and providing explanations.
The chatbot uses the OpenAI GPT-3 model to generate responses based on the input questions and the
context of the course documents. The course documents are indexed using the LLAMA Indexing Library
and stored in a vector database for efficient retrieval.
Author: Ali Shiraee
Last Modified: November 7th, 2024
"""
import json
import os
from pathlib import Path
import streamlit as st
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from data_handler import fetch_users
import openai
import chromadb
import streamlit_authenticator as stauth
import yaml
from yaml.loader import SafeLoader
import dotenv
def is_unchanged(docs_path, vectordb_path):
    """
    Check whether the documents in docs_path have changed since the last indexing.

    The comparison is by file name only: the set of file names currently on
    disk is compared against the file names recorded in the vector store's
    metadata. Edits that keep the same file name are therefore not detected.

    Args:
        docs_path (str): The path to the directory containing the documents.
        vectordb_path (str): The path to the directory containing the vector database.

    Returns:
        bool: True if the file names on disk match the file names recorded in
        the vector store, False otherwise.
    """
    directory = Path(docs_path)
    # Collect every regular file, including files without an extension;
    # the previous "*.*" pattern skipped those and could also match
    # directories whose names contain a dot. The indexer reads all files
    # recursively, so the comparison should cover all files too.
    current_files = {file.name for file in directory.rglob("*") if file.is_file()}
    db = chromadb.PersistentClient(path=vectordb_path)
    chroma_collection = db.get_or_create_collection("documents")
    all_saved_docs = chroma_collection.get()
    # Tolerate metadata entries that lack a "file_name" key instead of raising.
    previous_files = {
        meta["file_name"]
        for meta in all_saved_docs["metadatas"]
        if meta and "file_name" in meta
    }
    return previous_files == current_files
def clear_chat_history():
    """Reset the conversation to the single initial assistant greeting."""
    greeting = (
        "I can answer your questions about the course "
        "material, exams, outlines, and other course details. Ask me anything!"
    )
    st.session_state.messages = [{"role": "assistant", "content": greeting}]
with open("assistant_config.json", encoding="utf-8") as f:
json_config = json.load(f)
config = fetch_users()
Settings.llm = OpenAI(model=json_config["model"])
Settings.temperature = json_config["temperature"]
Settings.system_prompt = json_config["system_prompt"]
Settings.embed_model = OpenAIEmbedding(model=json_config["embed_model"])
Settings.node_parser = SentenceSplitter(
chunk_size=json_config["chunk_size"], chunk_overlap=json_config["chunk_overlap"]
)
st.set_page_config(
page_title=json_config["name"],
page_icon="🤖",
layout="centered",
initial_sidebar_state="auto",
menu_items=None,
)
authenticator = stauth.Authenticate(
config["credentials"],
config["cookie"]["name"],
config["cookie"]["key"],
config["cookie"]["expiry_days"],
)
authenticator.login(location="sidebar")
if st.session_state["authentication_status"] is False:
st.error("Username/password is incorrect")
elif st.session_state["authentication_status"] is None:
st.title(json_config["name"])
st.warning("Please enter your username and password to proceed.")
else:
openai.api_key = dotenv.get_key(".env", "OPENAI_API_KEY")
st.title(json_config["name"])
st.info(
"AI can make mistakes, make sure to double-check important information.",
icon="⚠️",
)
if "messages" not in st.session_state.keys():
st.session_state.messages = [
{
"role": "assistant",
"content": (
"I can answer your questions about the course "
"material, exams, outlines, and other course details. Ask me anything!"
),
}
]
with st.sidebar:
st.image("./assets/m24-rev_png.png")
st.write(f'Welcome *{st.session_state["name"]}*')
st.markdown(
(
f"This is your personal teaching assistant for the **{json_config['course_name']}**"
" course. You can access the course content "
"[here](https://avenue.cllmcmaster.ca/d2l/le/content/596841/Home)"
)
)
authenticator.logout("Logout", "main")
st.sidebar.button("Clear Chat History", on_click=clear_chat_history)
@st.cache_resource(show_spinner=False)
def create_index():
"""
Load and index documents for the first time and save them into the vector store.
Returns:
VectorStoreIndex: The index created from the documents in the docs_path.
"""
with st.spinner(
text="Loading and indexing documents for the first time"
"– hang tight! This might take 1-2 minutes."
):
reader = SimpleDirectoryReader(
input_dir=json_config["docs_path"], recursive=True
)
docs = reader.load_data()
db = chromadb.PersistentClient(path=json_config["vectordb_path"])
chroma_collection = db.get_or_create_collection("documents")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
return VectorStoreIndex.from_documents(
docs, storage_context=storage_context
)
@st.cache_resource(show_spinner=False)
def load_index():
"""
Load the index from the existing vector store.
Returns:
VectorStoreIndex: The index loaded from the existing vector store.
"""
db = chromadb.PersistentClient(path=json_config["vectordb_path"])
chroma_collection = db.get_or_create_collection("documents")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
return VectorStoreIndex.from_vector_store(vector_store)
if (
os.path.exists(json_config["vectordb_path"])
and os.listdir(json_config["vectordb_path"])
and is_unchanged(json_config["docs_path"], json_config["vectordb_path"])
):
index = load_index()
else:
index = create_index()
if "chat_engine" not in st.session_state.keys():
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
st.session_state.chat_engine = index.as_chat_engine(
chat_mode="condense_plus_context",
memory=memory,
context_prompt=(
"You are an AI teaching assistant, able to answer questions about"
" course material, exams, outlines, and other course details."
" Here are the relevant documents for the context:\n"
"{context_str}"
"\nInstruction: Based on the above documents, provide a detailed answer for the"
" student question below. If there is no relevant information in the documents,"
" inform the student that no related information was found in the course database"
" and refuse to answer the question."
),
verbose=True,
)
if prompt := st.chat_input("Type your question..."):
st.session_state.messages.append({"role": "user", "content": prompt})
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.write(message["content"])
if st.session_state.messages[-1]["role"] != "assistant":
with st.chat_message("assistant"):
with st.spinner("Thinking..."):
response = st.session_state.chat_engine.chat(prompt)
if response.source_nodes and response.source_nodes[0].score > 0.3:
relevant_node = response.source_nodes[0]
# page {relevant_node.metadata["page_label"]}'
SOURCE_STRING = (
f'<b>Source:</b> file "{relevant_node.metadata["file_name"]}"'
)
SOURCE_HTML = f'<br><br><span style="font-size:0.75rem;">{SOURCE_STRING}</span>'
else:
print(f"success, nodes: {response.source_nodes}")
SOURCE_HTML = ""
HTML_STRING = f"""
<div>
{response.response}{SOURCE_HTML}
</div>
"""
st.write(HTML_STRING, unsafe_allow_html=True)
message = {"role": "assistant", "content": response.response}
st.session_state.messages.append(message)