# Container image for the Streamlit "PDF Text Summarizer" app (app.py).
FROM python:3.8-slim

WORKDIR /app

# Copy only the dependency manifest first: the pip layer below is then reused
# from the build cache until requirements.txt itself changes, instead of being
# rebuilt on every source edit.
COPY requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir -r requirements.txt

# Application sources go in last so that code changes only invalidate this layer.
COPY . /app

# Port that Streamlit serves on by default.
EXPOSE 8501

CMD ["streamlit", "run", "app.py"]
import streamlit as st
import os
import tempfile
from src.pytesseract_ocr import PDFToTextConverter
from src.summarizer import TextSummarizer

# TODO: summarization is not wired in yet. Once it is enabled, create the
# summarizer here (summarizer = TextSummarizer()) and write the result of
# summarizer.summarize(...) under the "Summarized Text" headings below.

# Custom CSS for styling
st.markdown("""

    """, unsafe_allow_html=True)

# Title and introduction
st.title("PDF Text Summarizer")

# Sidebar for selection
option = st.sidebar.radio(
    "Choose the input method:",
    ("Upload PDF", "Enter Text")
)


def process_pdf(file_path):
    """Run PDFToTextConverter on the PDF at ``file_path``.

    Args:
        file_path (str): Path of the PDF file on disk.

    Returns:
        Whatever ``PDFToTextConverter.convert()`` returns (presumably the
        extracted text; the converter is defined in src/pytesseract_ocr.py).
    """
    converter = PDFToTextConverter(file_path)
    return converter.convert()


def _extract_text_from_upload(uploaded_file):
    """Write an uploaded PDF to a temporary file and extract its text.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``.

    Returns:
        The result of ``process_pdf`` for the temporary copy of the upload.
    """
    # delete=False because the converter reopens the file by path; the file is
    # removed explicitly in the ``finally`` clause below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    # The ``with`` block has closed (and therefore flushed) the file, so the
    # converter sees the complete PDF rather than a partially written one.
    try:
        return process_pdf(tmp_path)
    finally:
        # Always clean up, even when extraction raises.
        os.remove(tmp_path)


# Main container
with st.container():
    st.write("## Summary Output")
    st.markdown("""
This tool allows you to extract and summarize text from PDF documents or directly from your input.
Choose your preferred method and get concise summaries quickly and efficiently.
""")
    # A container (not st.empty()) is required here: st.empty() holds a single
    # element, so each successive write would replace the previous one and only
    # the last heading would remain visible.
    output_container = st.container()

    # Handle PDF upload
    if option == "Upload PDF":
        st.markdown("### Upload your PDF")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

        if st.button('Summarize PDF'):
            if uploaded_file is not None:
                with st.spinner('Extracting and summarizing...'):
                    extracted_text = _extract_text_from_upload(uploaded_file)

                if extracted_text:
                    output_container.markdown("### Extracted Text:")
                    output_container.write(extracted_text)
                    output_container.markdown("### Summarized Text:")
                    # TODO: write summarizer.summarize(extracted_text) here.
                else:
                    st.error("No text extracted from PDF.")
            else:
                st.error("Please upload a PDF file.")

    # Handle text input
    elif option == "Enter Text":
        st.markdown("### Enter your text")
        user_input_text = st.text_area("Input your text here:")

        if st.button('Summarize Text'):
            if user_input_text:
                output_container.markdown("### Original Text:")
                output_container.write(user_input_text)
                output_container.markdown("### Summarized Text:")
                # TODO: write summarizer.summarize(user_input_text) here.
            else:
                st.error("Please enter some text to process.")
class TextSummarizer:
    """Wrapper around a Hugging Face ``transformers`` summarization pipeline."""

    def __init__(self, model_name="facebook/bart-large-cnn"):
        """
        Initialize the TextSummarizer with a specific model.

        Args:
            model_name (str): The name of the model to be used for summarization.
        """
        # Imported here rather than at module level so that merely importing
        # this module does not load transformers; the cost is paid only when a
        # summarizer is actually constructed.
        from transformers import pipeline

        self.model_name = model_name
        self.summarizer = pipeline("summarization", model=self.model_name)

    def summarize(self, text, max_length=50, min_length=5):
        """
        Summarize the given text using the loaded model.

        Args:
            text (str): The text to be summarized.
            max_length (int): The maximum length of the summarized text.
            min_length (int): The minimum length of the summarized text.

        Returns:
            str: The summarized text, an empty string when ``text`` is a blank
            string, or an error message if an exception occurs.
        """
        # Nothing to summarize: skip the model call for empty/whitespace input.
        if isinstance(text, str) and not text.strip():
            return ""

        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                # Inputs longer than the model accepts (e.g. text extracted
                # from a large PDF) are cut to fit instead of failing.
                truncation=True,
            )
            return summary[0]['summary_text']
        except Exception as e:
            # Callers expect a string back in every case, so the failure is
            # reported in-band rather than raised.
            return f"An error occurred: {e}"