NewsTraker: News Article Summarizer with Q&A Bot #1

Open · wants to merge 5 commits into main
Binary file added NewsTracker-Daxa/NewsTraker_ Project Report.pdf
Binary file not shown.
161 changes: 161 additions & 0 deletions NewsTracker-Daxa/app.py
@@ -0,0 +1,161 @@
# Import necessary libraries and modules
from flask import Flask, request, render_template, jsonify # Flask web framework components
import requests # For making HTTP requests
from bs4 import BeautifulSoup # For web scraping
from transformers import BertTokenizer, BertForQuestionAnswering, pipeline # For BERT model and pipeline
from langchain_community.vectorstores import Chroma # For vector database
from langchain_community.embeddings import OllamaEmbeddings # For embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter # For text splitting
from langchain.prompts import ChatPromptTemplate, PromptTemplate # For creating prompts
from langchain_core.output_parsers import StrOutputParser # For output parsing
from langchain_community.chat_models import ChatOllama # For chat models
from langchain_core.runnables import RunnablePassthrough # For running tasks
from langchain.retrievers.multi_query import MultiQueryRetriever # For multi-query retrieval
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords # For stopwords
from nltk.tokenize import word_tokenize # For tokenizing words

#################################################################################################################################

# Download stopwords and punkt tokenizer models
nltk.download('stopwords')
nltk.download('punkt')
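# Note: on newer NLTK releases, word_tokenize may also require the 'punkt_tab'
# resource (nltk.download('punkt_tab')).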

#################################################################################################################################

# Initialize the Flask application
app = Flask(__name__)

#################################################################################################################################

# Function to remove stopwords from text
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = ' '.join([word for word in word_tokens if word.lower() not in stop_words])
    return filtered_text

# Function to scrape and clean text from a webpage
def scrape_search_results(url):
    response = requests.get(url, timeout=10)  # Fetch the webpage content (timeout avoids hanging requests)
    soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content
    text = ' '.join([p.get_text() for p in soup.find_all('p')])  # Extract text from paragraph tags
    text = remove_stopwords(text)  # Remove stopwords from the text
    return text

# Define a class to handle documents
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

#################################################################################################################################

# Load BERT QA model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Initialize the QA pipeline
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
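# Note: qa_pipeline is initialized here but is not referenced by the routes below;
# question answering is instead handled by the LangChain chain built in /summarize.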

# Initialize the text splitter for document chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
vector_db = None # Placeholder for the vector database

# Initialize the local LLM (Large Language Model) from Ollama
local_model = "llama3"
llm = ChatOllama(model=local_model)

#################################################################################################################################

# Define a prompt template for generating alternative questions
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from
a vector database. By generating multiple perspectives on the user question, your
goal is to help the user overcome some of the limitations of the distance-based
similarity search. Provide these alternative questions separated by newlines.
Original question: {question}""",
)

retriever = None # Placeholder for the retriever
chain = None # Placeholder for the QA chain

#################################################################################################################################

@app.route('/')
def index():
    # Render the index.html template
    return render_template('index.html')

@app.route('/summarize', methods=['POST'])
def summarize():
    url = request.form['url']  # Get the URL from the form
    scraped_text = scrape_search_results(url)  # Scrape and clean the article text
    document = Document(scraped_text)  # Create a Document object

    global vector_db
    chunks = text_splitter.split_documents([document])  # Split the document into chunks

    # Create a vector database from the document chunks
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
        collection_name="local-rag"
    )

    global retriever, chain
    retriever = MultiQueryRetriever.from_llm(
        vector_db.as_retriever(),
        llm,
        prompt=QUERY_PROMPT
    )

    # Define a prompt template for the QA system
    template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

    prompt = ChatPromptTemplate.from_template(template)

    # Define the QA chain
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # Define a prompt template for summarizing the text
    summary_prompt = PromptTemplate(
        input_variables=["text"],
        template="""Summarize the following text in a concise manner:\n{text}"""
    )

    # Define the summary chain
    summary_chain = (
        {"text": RunnablePassthrough()}
        | summary_prompt
        | llm
        | StrOutputParser()
    )

    # Generate the summary
    summary = summary_chain.invoke(input=document.page_content)
    return jsonify(summary=summary)

@app.route('/ask', methods=['POST'])
def ask():
    question = request.form['question']  # Get the question from the form
    if question.lower() in ["exit", "quit"]:
        return jsonify(answer="Chatbot session ended.")
    if chain is None:
        # Guard: the QA chain is only built after /summarize has been called
        return jsonify(answer="Please summarize an article first.")
    answer = chain.invoke(question)  # Get the answer using the QA chain
    return jsonify(answer=answer)

#################################################################################################################################

# Run the Flask application
if __name__ == '__main__':
    app.run(debug=True)
84 changes: 84 additions & 0 deletions NewsTracker-Daxa/templates/index.html
@@ -0,0 +1,84 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <!-- Ensure the webpage is properly rendered on different devices -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NewsTraker</title>
    <!-- Include Bootstrap CSS for styling -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
</head>
<body>
    <div class="container mt-5">
        <h1 class="text-center">NewsTraker: News Article Summarizer with Q&amp;A Bot</h1>
        <div class="mt-4">
            <!-- Form to input URL for web scraping -->
            <form id="scrape-form">
                <div class="form-group">
                    <label for="url">Enter URL:</label>
                    <input type="text" class="form-control" id="url" required>
                </div>
                <button type="submit" class="btn btn-primary">Get Summary</button>
            </form>
        </div>
        <div class="mt-4" id="summary-section" style="display:none;">
            <h3>Summary</h3>
            <p id="summary-text"></p>
            <hr>
            <h3>Ask a Question</h3>
            <!-- Form to input question for the chatbot -->
            <form id="chat-form">
                <div class="form-group">
                    <label for="question">Enter your question:</label>
                    <input type="text" class="form-control" id="question" required>
                </div>
                <button type="submit" class="btn btn-primary">Ask</button>
            </form>
            <div class="mt-4" id="chat-response" style="display:none;">
                <h5>Chatbot Answer</h5>
                <p id="response-text"></p>
            </div>
        </div>
    </div>
    <!-- Include jQuery for handling form submissions and AJAX requests -->
    <script src="https://code.jquery.com/jquery-3.5.1.min.js"></script>
    <script>
        $(document).ready(function() {
            // Handle form submission for scraping the article
            $('#scrape-form').on('submit', function(e) {
                e.preventDefault();
                var url = $('#url').val();
                // Send a POST request to the server to get the summary
                $.post('/summarize', {url: url}, function(data) {
                    $('#summary-text').text(data.summary);
                    $('#summary-section').show();
                });
            });

            // Handle form submission for asking a question
            $('#chat-form').on('submit', function(e) {
                e.preventDefault();
                askQuestion();
            });

            // Function to send a question to the server and get the answer
            function askQuestion() {
                var question = $('#question').val();
                // Send a POST request to the server to get the answer
                $.post('/ask', {question: question}, function(data) {
                    if (data.answer.toLowerCase() === 'chatbot session ended.') {
                        $('#response-text').text(data.answer);
                        $('#chat-response').show();
                        $('#question').attr('disabled', true);
                        $('#chat-form button').attr('disabled', true);
                    } else {
                        $('#response-text').append('<p>You: ' + question + '</p><p>Chatbot: ' + data.answer + '</p><hr>');
                        $('#chat-response').show();
                        $('#question').val('').focus();
                    }
                });
            }
        });
    </script>
</body>
</html>
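
For local testing, a minimal client sketch is shown below. This is an illustration only: it assumes app.py is running on Flask's default address http://127.0.0.1:5000, that Ollama is serving the llama3 and nomic-embed-text models, and that the article URL is a placeholder.

# Minimal client sketch for exercising the NewsTraker endpoints (assumptions noted above).
import requests

BASE_URL = "http://127.0.0.1:5000"  # Flask development server default (assumption)
ARTICLE_URL = "https://example.com/some-news-article"  # placeholder article URL

# Summarize the article; this call also builds the vector store used by /ask
resp = requests.post(f"{BASE_URL}/summarize", data={"url": ARTICLE_URL}, timeout=600)
print("Summary:", resp.json()["summary"])

# Ask a follow-up question about the same article
resp = requests.post(f"{BASE_URL}/ask", data={"question": "What is the article about?"}, timeout=600)
print("Answer:", resp.json()["answer"])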