diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..3991070 --- /dev/null +++ b/.env.example @@ -0,0 +1 @@ +GOOGLE_API_KEY="" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 68bc17f..5084291 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + + +per_dir/ \ No newline at end of file diff --git a/README.md b/README.md index a7856ac..5b9c7b4 100644 --- a/README.md +++ b/README.md @@ -1 +1,99 @@ -# pebblo-hackathons \ No newline at end of file +## Overview + +This repository contains a Streamlit application that integrates multiple components from the LangChain library and Google Generative AI to create an interactive, AI-powered chatbot. The chatbot can process, retrieve, and respond to user queries based on a pre-loaded PDF document. The system employs a vector database for efficient information retrieval and ensures responses are safe by leveraging Google's safety settings. + +## Features + +- **Google Generative AI Integration**: Utilizes the `ChatGoogleGenerativeAI` model for generating human-like responses. +- **PDF Document Processing**: Loads and processes PDF documents into chunks for efficient retrieval. +- **Vector Database**: Stores processed text chunks in a vector database using `Chroma`, allowing for fast and accurate query retrieval. +- **Streamlit Interface**: Provides a simple and interactive user interface for chat-based interaction. +- **Safety Settings**: Configured to block harmful content using Google's safety settings. 
+ +## Installation + +### Prerequisites + +Ensure you have the following installed: + +- Python 3.8+ +- [Streamlit](https://streamlit.io/) +- [Google Generative AI](https://developers.generativeai.google) +- [LangChain](https://langchain.readthedocs.io/) +- [dotenv](https://pypi.org/project/python-dotenv/) + +### Setup + +1. Clone the repository: + ```bash + git clone https://github.com/yourusername/your-repo-name.git + cd your-repo-name + ``` + +2. Create a virtual environment and activate it: + ```bash + python -m venv venv + source venv/bin/activate # On Windows use `venv\Scripts\activate` + ``` + +3. Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +4. Set up your environment variables by creating a `.env` file in the root directory: + ``` + GOOGLE_API_KEY=your_google_api_key + ``` + +5. Place the PDF document in the `docs` directory: + ``` + docs/Merged_Ayurbeat_Everyday_Ayurveda.pdf + ``` + +## Usage + +1. Run the Streamlit application: + ```bash + streamlit run app.py + ``` + +2. The application will start by checking if a vector database already exists. If not, it will process the provided PDF document, split it into chunks, and create a vector database. + +3. Once the setup is complete, the application will load the vector database and present a chat interface. + +4. Enter your query in the chat input, and the chatbot will respond based on the content of the processed PDF. + +## File Structure + +- **app.py**: Main script containing the logic for the Streamlit application. +- **docs/**: Directory containing the PDF document to be processed. +- **per_dir/**: Directory where the vector database will be persisted. +- **.env**: Environment variables, including the Google API key. + +## Configuration + +- **Google API Key**: Required for using Google Generative AI. Store it in the `.env` file. +- **PDF Loader**: Currently set to load a specific PDF document. Modify the path in the script to use a different document. 
"""Streamlit RAG chatbot over a pre-loaded Ayurveda PDF.

On first run the script loads the PDF through PebbloSafeLoader, splits it
into large overlapping chunks, embeds them with Google Generative AI
embeddings, and persists a Chroma vector store under ``per_dir/``.
Subsequent runs reuse the persisted store.  User queries are answered by a
RetrievalQA chain backed by Gemini 1.5 Pro, with chat history kept in
Streamlit session state so it survives reruns.
"""

from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory
import google.generativeai as genai
from langchain_community.document_loaders import PyPDFLoader, PebbloSafeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import streamlit as st
import os


# Load environment variables (GOOGLE_API_KEY) from the .env file.
load_dotenv()

# Resolve all file paths relative to the process working directory.
# (Replaces the old `os.environ['PWD'] = os.getcwd()` round-trip, which
# mutated the environment for no benefit.)
BASE_DIR = os.getcwd()

# Directory where the Chroma vector store is persisted between runs.
persist_directory = os.path.join(BASE_DIR, 'per_dir')
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chat history: list of {'role': ..., 'content': ...} dicts, kept in
# session state so it survives Streamlit reruns.
if "history" not in st.session_state:
    st.session_state.history = []

# Configure the Google Generative AI SDK; fail with a readable message
# instead of an opaque KeyError when the key is missing.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    st.error("GOOGLE_API_KEY is not set. Add it to your .env file.")
    st.stop()
genai.configure(api_key=api_key)

# Stream model tokens to stdout for server-side debugging.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Gemini chat model with explicit safety settings for dangerous content.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.HARM_BLOCK_THRESHOLD_UNSPECIFIED,
    },
    callback_manager=callback_manager,
)

# Build the vector store on first run; reuse the persisted one afterwards.
if not os.path.exists(persist_directory):
    with st.spinner('🚀 Starting your bot. This might take a while'):
        # Load the PDF through Pebblo for safe-loading / governance visibility.
        pdf_path = os.path.join(BASE_DIR, 'docs', 'Merged_Ayurbeat_Everyday_Ayurveda.pdf')
        pdf_loader = PebbloSafeLoader(PyPDFLoader(pdf_path), name="Final")
        pdf_documents = pdf_loader.load()

        # Join all pages into one string, then re-split into large
        # overlapping chunks for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=50)
        pdf_context = "\n\n".join(str(p.page_content) for p in pdf_documents)
        pdfs = splitter.split_text(pdf_context)

        print("Data Processing Complete")

        # Chroma persists automatically when persist_directory is given.
        # The previous explicit `vectordb.persist()` call was removed: the
        # langchain_chroma Chroma class no longer exposes it and the call
        # raised AttributeError on first run.
        vectordb = Chroma.from_texts(pdfs, embeddings, persist_directory=persist_directory)

        print("Vector DB Creation Complete\n")
else:
    # Reuse the existing on-disk vector store.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Vector DB Loaded\n")

# Retrieval-augmented QA chain: retrieve chunks, let Gemini answer.
query_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=vectordb.as_retriever(),
)

# Replay the conversation so far.
for msg in st.session_state.history:
    with st.chat_message(msg['role']):
        st.markdown(msg['content'])

# Handle a new user message.
prompt = st.chat_input("Say something")
if prompt:
    # Store only the raw prompt.  The guardrail suffix below is a model
    # instruction, not user content — the old code stored it in history
    # (so it reappeared on every replay) while never sending it to the model.
    st.session_state.history.append({
        'role': 'user',
        'content': prompt,
    })

    # Echo the user message in the chat pane.
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate and display the assistant's response.
    with st.spinner('💡Thinking'):
        # Append the guardrail instruction to the query actually sent to
        # the chain, which is where it was always meant to go.
        guarded_query = (
            prompt
            + ". don't give harmful advice or anything that can hurt someone's "
              "feelings or emotions or anything outside of the context of this chat."
        )
        response = query_chain.invoke({"query": guarded_query})

    # Use Streamlit's built-in lowercase "assistant" role so the proper
    # avatar is rendered (capitalised "Assistant" was treated as a custom
    # user name).
    st.session_state.history.append({
        'role': 'assistant',
        'content': response['result'],
    })

    with st.chat_message("assistant"):
        st.markdown(response['result'])