diff --git a/app/chat/chat.py b/app/chat/chat.py index 9a4ed15..63cc92e 100644 --- a/app/chat/chat.py +++ b/app/chat/chat.py @@ -1,16 +1,16 @@ +from langchain.chains import ConversationalRetrievalChain from app.chat.models import ChatArgs - +from app.chat.vector_stores.pinecone import build_retriever +from app.chat.llms.chatopenai import build_llm +from app.chat.memories.sql_memory import build_memory def build_chat(chat_args: ChatArgs): - """ - :param chat_args: ChatArgs object containing - conversation_id, pdf_id, metadata, and streaming flag. - - :return: A chain - - Example Usage: - - chain = build_chat(chat_args) - """ + retriever = build_retriever(chat_args) + llm = build_llm(chat_args) + memory = build_memory(chat_args) - pass + return ConversationalRetrievalChain.from_llm( + llm=llm, + memory=memory, + retriever=retriever + ) \ No newline at end of file diff --git a/app/chat/create_embeddings.py b/app/chat/create_embeddings.py index f10fda9..b73bd8c 100644 --- a/app/chat/create_embeddings.py +++ b/app/chat/create_embeddings.py @@ -1,18 +1,27 @@ -def create_embeddings_for_pdf(pdf_id: str, pdf_path: str): - """ - Generate and store embeddings for the given pdf +from langchain.document_loaders import PyPDFLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from app.chat.vector_stores.pinecone import vector_store - 1. Extract text from the specified PDF. - 2. Divide the extracted text into manageable chunks. - 3. Generate an embedding for each chunk. - 4. Persist the generated embeddings. +def create_embeddings_for_pdf(pdf_id: str, pdf_path: str): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=100 + ) - :param pdf_id: The unique identifier for the PDF. - :param pdf_path: The file path to the PDF. + loader = PyPDFLoader(file_path=pdf_path) + documents = loader.load_and_split(text_splitter) - Example Usage: + # for doc in documents: + # try: + # print(doc) + # except UnicodeEncodeError: + # print("UnicodeEncodeError") - create_embeddings_for_pdf('123456', '/path/to/pdf') - """ + for doc in documents: + doc.metadata = { + "page": doc.metadata["page"], + "text": doc.page_content, + "pdf_id": pdf_id + } - pass + vector_store.add_documents(documents) diff --git a/app/chat/embeddings/__init__.py b/app/chat/embeddings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/chat/embeddings/openai.py b/app/chat/embeddings/openai.py new file mode 100644 index 0000000..a87306f --- /dev/null +++ b/app/chat/embeddings/openai.py @@ -0,0 +1,3 @@ +from langchain.embeddings import OpenAIEmbeddings + +embeddings = OpenAIEmbeddings() \ No newline at end of file diff --git a/app/chat/llms/chatopenai.py b/app/chat/llms/chatopenai.py new file mode 100644 index 0000000..be85b34 --- /dev/null +++ b/app/chat/llms/chatopenai.py @@ -0,0 +1,20 @@ +# from langchain.chains import ConversationalRetrievalChain +# from langchain_openai import ChatOpenAI +# from app.chat.vector_stores.pinecone import build_retriever +# from app.chat.llms.chatopenai import build_llm +# from app.chat.memories.sql_memory import build_memory + +# def build_llm(chat_args): +# retriever = build_retriever(chat_args) +# llm = build_llm(chat_args) +# memory = build_memory(chat_args) +# return ConversationalRetrievalChain.from_llm( +# llm=llm, +# retriever=retriever, +# memory=memory +# ) + +from langchain.chat_models import ChatOpenAI + +def build_llm(chat_args): + return ChatOpenAI() \ No newline at end of file diff --git a/app/chat/memories/__init__.py b/app/chat/memories/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/chat/memories/sql_memory.py b/app/chat/memories/sql_memory.py new file mode 100644 index 0000000..1a45832 --- /dev/null +++ b/app/chat/memories/sql_memory.py @@ -0,0 +1,34 @@ +from pydantic import BaseModel +from langchain.memory import ConversationBufferMemory +from langchain.schema import BaseChatMessageHistory + +from app.web.api import( + get_messages_by_conversation_id, + add_message_to_conversation +) + +class SqlMessageHistory(BaseChatMessageHistory, BaseModel): + conversation_id: str + @property + def messages(self): + return get_messages_by_conversation_id(self.conversation_id) + + def add_message(self, message): + return add_message_to_conversation( + conversation_id = self.conversation_id, + role = message.type, + content = message.content + ) + + def clear(self): + pass + +def build_memory(chat_args): + return ConversationBufferMemory( + chat_memory=SqlMessageHistory( + conversation_id=chat_args.conversation_id + ), + return_messages=True, + memory_key="chat_history", + output_key="answer" + ) \ No newline at end of file diff --git a/app/chat/vector_stores/__init__.py b/app/chat/vector_stores/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/chat/vector_stores/pinecone.py b/app/chat/vector_stores/pinecone.py new file mode 100644 index 0000000..d9f0c94 --- /dev/null +++ b/app/chat/vector_stores/pinecone.py @@ -0,0 +1,19 @@ +import os +import pinecone +from langchain.vectorstores.pinecone import Pinecone +from app.chat.embeddings.openai import embeddings + +pinecone.Pinecone( + api_key=os.getenv("PINECONE_API_KEY"), + environment=os.getenv("PINECONE_ENV_NAME") +) + +vector_store = Pinecone.from_existing_index( + os.getenv("PINECONE_INDEX_NAME"), embeddings +) + +def build_retriever(chat_args): + search_kwargs = {"filter": { "pdf_id": chat_args.pdf_id }} + return vector_store.as_retriever( + search_kwargs=search_kwargs + ) \ No newline at end of file diff --git a/app/web/views/pdf_views.py b/app/web/views/pdf_views.py index 0a3a37b..06c4761 100644 --- a/app/web/views/pdf_views.py +++ b/app/web/views/pdf_views.py @@ -26,14 +26,13 @@ def upload_file(file_id, file_path, file_name): pdf = Pdf.create(id=file_id, name=file_name, user_id=g.user.id) - # TODO: Defer this to be processed by the worker - process_document(pdf.id) + process_document.delay(pdf.id) return pdf.as_dict() @bp.route("/", methods=["GET"]) -@login_required +@login_required @load_model(Pdf) def show(pdf): return jsonify( diff --git a/client/src/components/auth/AuthLinks.svelte b/client/src/components/auth/AuthLinks.svelte index 84c03ba..9f924f5 100644 --- a/client/src/components/auth/AuthLinks.svelte +++ b/client/src/components/auth/AuthLinks.svelte @@ -5,15 +5,6 @@ {#if $auth.user} Sign Out + href="/auth/signout">Sign Out {:else} - Login - Sign Up -{/if} +{/if} \ No newline at end of file diff --git a/client/src/routes/+layout.svelte b/client/src/routes/+layout.svelte index f9b5964..e18bc56 100644 --- a/client/src/routes/+layout.svelte +++ b/client/src/routes/+layout.svelte @@ -22,3 +22,14 @@ + + \ No newline at end of file diff --git a/client/src/routes/auth/signin/+page.svelte b/client/src/routes/auth/signin/+page.svelte deleted file mode 100644 index 0091019..0000000 --- a/client/src/routes/auth/signin/+page.svelte +++ /dev/null @@ -1,59 +0,0 @@ - - -
-
-
-
-

Sign In

-

- Don't have have an account? - - Sign Up Here - -

-
- -
-
-
- - - - - - - - - {#if $auth.error} - Error: {$auth.error} - {/if} - - -
-
-
-
-
-
diff --git a/client/src/routes/auth/signout/+page.svelte b/client/src/routes/auth/signout/+page.svelte deleted file mode 100644 index 0c616c7..0000000 --- a/client/src/routes/auth/signout/+page.svelte +++ /dev/null @@ -1,35 +0,0 @@ - - -
-
-
-
- Sad to see you go! -

Redirecting...

-
-
-
-
diff --git a/client/src/routes/auth/signup/+page.svelte b/client/src/routes/auth/signup/+page.svelte deleted file mode 100644 index a42d039..0000000 --- a/client/src/routes/auth/signup/+page.svelte +++ /dev/null @@ -1,67 +0,0 @@ - - -
-
-
-
-

Sign Up

-

- Already have an account? - - Sign In Here - -

-
- -
-
-
- - - - - - - - - - - - - {#if $auth.error} - Error: {$auth.error} - {/if} - - -
-
-
-
-
-
diff --git a/spice.pdf b/spice.pdf new file mode 100644 index 0000000..e19cb77 Binary files /dev/null and b/spice.pdf differ