NewsTraker: News Article Summarizer with Q&A Bot #1

Open · wants to merge 5 commits into main
Binary file added NewsTracker-Daxa/NewsTraker_ Project Report.pdf
Binary file not shown.
161 changes: 161 additions & 0 deletions NewsTracker-Daxa/app.py
@@ -0,0 +1,161 @@
# Import necessary libraries and modules
from flask import Flask, request, render_template, jsonify # Flask web framework components
import requests # For making HTTP requests
from bs4 import BeautifulSoup # For web scraping
from transformers import BertTokenizer, BertForQuestionAnswering, pipeline # For BERT model and pipeline
from langchain_community.vectorstores import Chroma # For vector database
from langchain_community.embeddings import OllamaEmbeddings # For embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter # For text splitting
from langchain.prompts import ChatPromptTemplate, PromptTemplate # For creating prompts
from langchain_core.output_parsers import StrOutputParser # For output parsing
from langchain_community.chat_models import ChatOllama # For chat models
from langchain_core.runnables import RunnablePassthrough # For running tasks
from langchain.retrievers.multi_query import MultiQueryRetriever # For multi-query retrieval
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords # For stopwords
from nltk.tokenize import word_tokenize # For tokenizing words

#################################################################################################################################

# Download stopwords and punkt tokenizer models
nltk.download('stopwords')
nltk.download('punkt')
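# Note: on newer NLTK releases, word_tokenize may also require the 'punkt_tab'
# resource (nltk.download('punkt_tab')).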

#################################################################################################################################

# Initialize the Flask application
app = Flask(__name__)

#################################################################################################################################

# Function to remove stopwords from text
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = ' '.join([word for word in word_tokens if word.lower() not in stop_words])
    return filtered_text

# Function to scrape and clean text from a webpage
def scrape_search_results(url):
    response = requests.get(url, timeout=10)  # Fetch the webpage content (timeout avoids hanging requests)
    soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content
    text = ' '.join([p.get_text() for p in soup.find_all('p')])  # Extract text from paragraph tags
    text = remove_stopwords(text)  # Remove stopwords from the text
    return text

# Define a class to handle documents
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

#################################################################################################################################

# Load BERT QA model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Initialize the QA pipeline
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
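# Note: qa_pipeline is initialized here but is not referenced by the routes below;
# question answering is instead handled by the LangChain chain built in /summarize.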

# Initialize the text splitter for document chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
vector_db = None # Placeholder for the vector database

# Initialize the local LLM (Large Language Model) from Ollama
local_model = "llama3"
llm = ChatOllama(model=local_model)

#################################################################################################################################

# Define a prompt template for generating alternative questions
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from
a vector database. By generating multiple perspectives on the user question, your
goal is to help the user overcome some of the limitations of the distance-based
similarity search. Provide these alternative questions separated by newlines.
Original question: {question}""",
)

retriever = None # Placeholder for the retriever
chain = None # Placeholder for the QA chain

#################################################################################################################################

@app.route('/')
def index():
    # Render the index.html template
    return render_template('index.html')

@app.route('/summarize', methods=['POST'])
def summarize():
    url = request.form['url']  # Get the URL from the form
    scraped_text = scrape_search_results(url)  # Scrape and clean the article text
    document = Document(scraped_text)  # Create a Document object

    global vector_db
    chunks = text_splitter.split_documents([document])  # Split the document into chunks

    # Create a vector database from the document chunks
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
        collection_name="local-rag"
    )

    global retriever, chain
    retriever = MultiQueryRetriever.from_llm(
        vector_db.as_retriever(),
        llm,
        prompt=QUERY_PROMPT
    )

    # Define a prompt template for the QA system
    template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

    prompt = ChatPromptTemplate.from_template(template)

    # Define the QA chain
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # Define a prompt template for summarizing the text
    summary_prompt = PromptTemplate(
        input_variables=["text"],
        template="""Summarize the following text in a concise manner:\n{text}"""
    )

    # Define the summary chain
    summary_chain = (
        {"text": RunnablePassthrough()}
        | summary_prompt
        | llm
        | StrOutputParser()
    )

    # Generate the summary
    summary = summary_chain.invoke(input=document.page_content)
    return jsonify(summary=summary)

@app.route('/ask', methods=['POST'])
def ask():
    question = request.form['question']  # Get the question from the form
    if question.lower() in ["exit", "quit"]:
        return jsonify(answer="Chatbot session ended.")
    if chain is None:
        # Guard: the QA chain is only built after /summarize has been called
        return jsonify(answer="Please summarize an article first.")
    answer = chain.invoke(question)  # Get the answer using the QA chain
    return jsonify(answer=answer)

#################################################################################################################################

# Run the Flask application
if __name__ == '__main__':
    app.run(debug=True)
84 changes: 84 additions & 0 deletions NewsTracker-Daxa/templates/index.html
@@ -0,0 +1,84 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <!-- Ensure the webpage is properly rendered on different devices -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NewsTraker</title>
    <!-- Include Bootstrap CSS for styling -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
</head>
<body>
    <div class="container mt-5">
        <h1 class="text-center">NewsTraker: News Article Summarizer with Q&amp;A Bot</h1>
        <div class="mt-4">
            <!-- Form to input URL for web scraping -->
            <form id="scrape-form">
                <div class="form-group">
                    <label for="url">Enter URL:</label>
                    <input type="text" class="form-control" id="url" required>
                </div>
                <button type="submit" class="btn btn-primary">Get Summary</button>
            </form>
        </div>
        <div class="mt-4" id="summary-section" style="display:none;">
            <h3>Summary</h3>
            <p id="summary-text"></p>
            <hr>
            <h3>Ask a Question</h3>
            <!-- Form to input question for the chatbot -->
            <form id="chat-form">
                <div class="form-group">
                    <label for="question">Enter your question:</label>
                    <input type="text" class="form-control" id="question" required>
                </div>
                <button type="submit" class="btn btn-primary">Ask</button>
            </form>
            <div class="mt-4" id="chat-response" style="display:none;">
                <h5>Chatbot Answer</h5>
                <p id="response-text"></p>
            </div>
        </div>
    </div>
    <!-- Include jQuery for handling form submissions and AJAX requests -->
    <script src="https://code.jquery.com/jquery-3.5.1.min.js"></script>
    <script>
        $(document).ready(function() {
            // Handle form submission for scraping the article
            $('#scrape-form').on('submit', function(e) {
                e.preventDefault();
                var url = $('#url').val();
                // Send a POST request to the server to get the summary
                $.post('/summarize', {url: url}, function(data) {
                    $('#summary-text').text(data.summary);
                    $('#summary-section').show();
                });
            });

            // Handle form submission for asking a question
            $('#chat-form').on('submit', function(e) {
                e.preventDefault();
                askQuestion();
            });

            // Function to send a question to the server and get the answer
            function askQuestion() {
                var question = $('#question').val();
                // Send a POST request to the server to get the answer
                $.post('/ask', {question: question}, function(data) {
                    if (data.answer.toLowerCase() === 'chatbot session ended.') {
                        $('#response-text').text(data.answer);
                        $('#chat-response').show();
                        $('#question').attr('disabled', true);
                        $('#chat-form button').attr('disabled', true);
                    } else {
                        $('#response-text').append('<p>You: ' + question + '</p><p>Chatbot: ' + data.answer + '</p><hr>');
                        $('#chat-response').show();
                        $('#question').val('').focus();
                    }
                });
            }
        });
    </script>
</body>
</html>
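
For local testing, a minimal client sketch is shown below. This is an illustration only: it assumes app.py is running on Flask's default address http://127.0.0.1:5000, that Ollama is serving the llama3 and nomic-embed-text models, and that the article URL is a placeholder.

# Minimal client sketch for exercising the NewsTraker endpoints (assumptions noted above).
import requests

BASE_URL = "http://127.0.0.1:5000"  # Flask development server default (assumption)
ARTICLE_URL = "https://example.com/some-news-article"  # placeholder article URL

# Summarize the article; this call also builds the vector store used by /ask
resp = requests.post(f"{BASE_URL}/summarize", data={"url": ARTICLE_URL}, timeout=600)
print("Summary:", resp.json()["summary"])

# Ask a follow-up question about the same article
resp = requests.post(f"{BASE_URL}/ask", data={"question": "What is the article about?"}, timeout=600)
print("Answer:", resp.json()["answer"])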