[BUG Fix Lumina charging] #3903

Merged 1 commit on Dec 26, 2024
72 changes: 41 additions & 31 deletions one_fm/wiki_chat_bot/main.py
@@ -2,16 +2,15 @@
import json

import frappe
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,PromptTemplate,GPTListIndex,StorageContext, load_index_from_storage
from llama_index.core import SimpleDirectoryReader,Document,VectorStoreIndex,PromptTemplate,GPTListIndex,StorageContext, load_index_from_storage
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import TextNode
from langchain.text_splitter import RecursiveCharacterTextSplitter


from one_fm.api.v1.utils import response

def split_text_into_chunks(text, chunk_size=4096):
def split_text_into_chunks(text, chunk_size=2096):
# This method was created to mitigate max-token errors
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
return splitter.split_text(text)
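
For reference, a minimal sketch of how the chunking helper above behaves when called on a long document. It assumes `langchain` is installed, as in this file's imports; the sample text and printed values are illustrative only.

```python
# Minimal sketch: exercising the chunking helper on a long string.
# Assumes langchain is installed, as in the file's imports; the sample text is made up.
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text_into_chunks(text, chunk_size=2096):
    # chunk_overlap=200 keeps some shared context between adjacent chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
    return splitter.split_text(text)

long_text = "Lumina is the wiki assistant for One Facilities Management. " * 200
chunks = split_text_into_chunks(long_text)
print(len(chunks), max(len(c) for c in chunks))  # several chunks, each at most ~2096 characters
```

Note that `RecursiveCharacterTextSplitter` measures `chunk_size` in characters by default, so 2096 is a character budget rather than a token budget; the smaller value simply leaves more headroom against the model's max-token limit.
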
@@ -20,29 +19,42 @@ def create_vector_index():
try:
os.environ["OPENAI_API_KEY"] = frappe.local.conf.CHATGPT_APIKEY
existing_text_nodes,new_text_nodes = [],[]
embedding_model = OpenAIEmbedding(model_name="gpt-4o-mini")
# Load existing data and append to nodelist
existing_docs = SimpleDirectoryReader("vector_index").load_data()

for doc in existing_docs:
chunks = split_text_into_chunks(doc.text)
existing_text_nodes.extend([TextNode(text=chunk) for chunk in chunks])
directory_path = "vector_index"
if os.path.exists(directory_path):
if os.listdir(directory_path):
existing_docs = SimpleDirectoryReader("vector_index").load_data()
storage_context = StorageContext.from_defaults(persist_dir="vector_index")
vector_index_ = load_index_from_storage(storage_context)
for doc in existing_docs:
chunks = split_text_into_chunks(doc.text)
existing_text_nodes.extend([TextNode(text=chunk) for chunk in chunks])
else:
os.mkdir(directory_path)

# Load new data
new_docs = SimpleDirectoryReader(get_folder_path()).load_data()
# new_text_nodes = [TextNode(text=doc.text) for doc in new_docs]

for doc in new_docs:
#Split the texts so we don't go over the max token value of the model
chunks = split_text_into_chunks(doc.text)
new_text_nodes.extend([TextNode(text=chunk) for chunk in chunks])
# Merge existing and new vector indexes
merged_nodes = existing_text_nodes+new_text_nodes
merged_vector_index = VectorStoreIndex(nodes= merged_nodes,embedding =embedding_model)
combined_text = "\n".join([node.text for node in new_text_nodes])
# Create a Document object
new_document = Document(text=combined_text)
if not os.listdir(directory_path):
vector_index_ = VectorStoreIndex.from_documents(new_docs)
else:
vector_index_.insert(new_document)
# merged_nodes = existing_text_nodes+new_text_nodes
# merged_vector_index = VectorStoreIndex(nodes= merged_nodes,embedding =embedding_model)

# Persist the merged vector index
merged_vector_index.storage_context.persist(persist_dir="vector_index")
# merged_vector_index.storage_context.persist(persist_dir="vector_index")
vector_index_.storage_context.persist(persist_dir=directory_path)

return merged_vector_index
return vector_index_
except:
frappe.log_error(frappe.get_traceback(), "Error while adding to bot memory(Chat-BOT)")

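The hunk above replaces the old rebuild-everything approach with a create-or-insert flow against the persisted index. Below is a condensed sketch of that flow, assuming the same `llama_index` imports used in this file; the new document text is hypothetical and `OPENAI_API_KEY` must be set for the embedding calls, as in `create_vector_index()`.

```python
# Condensed sketch of the create-or-insert flow; the document text is hypothetical
# and OPENAI_API_KEY is assumed to be set, as in create_vector_index().
import os
from llama_index.core import (
    Document, StorageContext, VectorStoreIndex, load_index_from_storage,
)

persist_dir = "vector_index"
new_docs = [Document(text="New wiki content to index.")]

if os.path.exists(persist_dir) and os.listdir(persist_dir):
    # An index already exists on disk: load it and insert the new document(s)
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    index = load_index_from_storage(storage_context)
    for doc in new_docs:
        index.insert(doc)
else:
    # First run: build a fresh index from the new documents
    os.makedirs(persist_dir, exist_ok=True)
    index = VectorStoreIndex.from_documents(new_docs)

# Persist the updated index back to the same directory
index.storage_context.persist(persist_dir=persist_dir)
```
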
@@ -53,38 +65,36 @@ def create_vector_index():
def ask_question(question: str = None):
try:
os.environ["OPENAI_API_KEY"] = frappe.local.conf.CHATGPT_APIKEY
if not question:
if not question.strip():
return response("Bad Request !", 400, error="Question can not be empty")
storage_context = StorageContext.from_defaults(persist_dir="vector_index")
index = load_index_from_storage(storage_context)

prompt_template_str = (
"Context information is below.\n"
"You are Lumina, an AI assistant working for One Facilities Management, a company headquartered in Kuwait. "
"You always respond to your name when addressed and provide assistance accordingly. "
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information and not prior knowledge, "
"You are an AI assistant called Lumina.\n"
"You do not need to introduce yourself or say who you are when you are not asked directly\n"
"You Work for One Faciities Management, A company with it's Headquarters in Kuwait\n"
"Whenever Lumina does not find the required data,ask the user to upload the most updated data to enable you answer the question appropriately\n"
"Query: {query_str}\n"
"Answer: "
)

refine_prompt_str = (
"We have the opportunity to refine the original answer "
"(only if needed) with some more context below.\n"
"------------\n"
"You should always respond in the same language as the query string even if the context is a different language \n"

"Given the new context, refine the original answer to better "
"answer the question: {query_str}. "

"Original Answer: {existing_answer}"
)
"As Lumina, the AI assistant for One Facilities Management, refine the original answer with the additional context below, "
"ensuring you respond to your name when addressed and maintain consistency in your responses.\n"
"------------\n"
"{context_str}\n"
"------------\n"
"Original Answer: {existing_answer}\n"
"Refined Answer: "
)

text_qa_template = PromptTemplate(prompt_template_str)
refined_text_qa_template = PromptTemplate(refine_prompt_str)
llm = OpenAI(model="gpt-4o-mini")
llm = OpenAI(model="gpt-4o-mini-2024-07-18")
query_engine = index.as_query_engine(llm=llm,text_qa_template=text_qa_template,refine_template=refined_text_qa_template)
answer = query_engine.query(question)
return response(message="Success", status_code=200, data={"question": question, "answer": answer.response})
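
For context, here is a stripped-down sketch of the query path in `ask_question`: load the persisted index, attach the custom QA template and the pinned `gpt-4o-mini-2024-07-18` model, and run the query. The question string is hypothetical and `OPENAI_API_KEY` is assumed to be set.

```python
# Stripped-down sketch of the query path in ask_question(); the question is hypothetical
# and OPENAI_API_KEY is assumed to be set.
from llama_index.core import PromptTemplate, StorageContext, load_index_from_storage
from llama_index.llms.openai import OpenAI

storage_context = StorageContext.from_defaults(persist_dir="vector_index")
index = load_index_from_storage(storage_context)

text_qa_template = PromptTemplate(
    "You are Lumina, an AI assistant working for One Facilities Management.\n"
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Query: {query_str}\n"
    "Answer: "
)

llm = OpenAI(model="gpt-4o-mini-2024-07-18")
query_engine = index.as_query_engine(llm=llm, text_qa_template=text_qa_template)
answer = query_engine.query("Who are you, Lumina?")
print(answer.response)
```

Pinning the dated model name and restating the Lumina persona in the refine template keeps the assistant's identity and language consistent across the initial answer and any refinement passes.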