ready

olawale0254 · Jan 15, 2024 · 5a209de · 5a209de
1 parent 94b6c91
commit 5a209de
Show file tree

Hide file tree

Showing 9 changed files with 208 additions and 9 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.8-slim
+
+WORKDIR /app
+
+# Copy only the dependency manifest first so the pip layer below is cached
+# and reused until requirements.txt itself changes.
+COPY requirements.txt /app/
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application source after the dependencies are installed, so
+# source-only edits do not trigger a full reinstall.
+COPY . /app
+
+# Port the Streamlit server listens on by default.
+EXPOSE 8501
+
+CMD ["streamlit", "run", "app.py"]
diff --git a/README.md b/README.md
@@ -1,2 +1,64 @@
-# IntellectSummarizer
-By leveraging the latest LLMs from platforms like Hugging Face, IntellectSummarizer doesn't just shorten texts; it comprehends and condenses the content, maintaining the essence and key points of the original documents.
+# PDF Text Summarizer
+
+## Overview
+
+PDF Text Summarizer is a Streamlit-based application that allows users to extract and summarize text from PDF documents or input text directly. It's designed to simplify the process of understanding large documents by providing concise summaries.
+
+## Features
+
+- **PDF Text Extraction**: Upload PDF documents to extract text.
+- **Text Summarization**: Summarize extracted or input text for quick comprehension.
+- **User-Friendly Interface**: Easy-to-use sidebar for method selection and interactive elements for a better user experience.
+
+## Project Architecture/Workflow
+
+### Components
+
+1. **Streamlit Application (`app.py`)**: The frontend interface where users interact with the application.
+2. **Text Extraction Module (`src/pytesseract_ocr.py`)**: Extracts text from uploaded PDF files.
+3. **Text Summarization Module (`src/summarizer.py`)**: Summarizes the extracted or input text.
+
+### Workflow
+
+1. **Start**: User chooses to upload a PDF or input text directly.
+2. **Processing**:
+    - If a PDF is uploaded, the `PDFToTextConverter` extracts text from the PDF.
+    - If text is input directly, it is taken as is for summarization.
+3. **Summarization**: The `TextSummarizer` generates a concise summary of the provided text.
+4. **Display**: The original text (if extracted) and the summarized text are displayed to the user.
+
+## How to Use
+
+1. **Start the Application**: Run `streamlit run app.py` in the terminal.
+2. **Choose Input Method**: Use the sidebar to select between uploading a PDF or entering text.
+3. **Upload or Enter Text**: Either upload a PDF file or type text into the provided text area.
+4. **Summarize**: Click the 'Summarize' button to process and view the summary.
+
+## Screenshots/Clippings
+
+Here are some screenshots showing different stages of the PDF Text Summarizer application:
+
+### Start Screen
+![Start Screen](URL_TO_YOUR_START_SCREEN_IMAGE)
+
+
+## Installation and Setup
+
+Follow these steps to get the application up and running:
+
+1. **Clone the repository**:
+    ```
+    git clone https://github.com/olawale0254/IntellectSummarizer.git
+    ```
+
+2. **Navigate to the project directory**:
+    ```
+    cd IntellectSummarizer
+    ```
+
+3. **Install dependencies**:
+
+    ```
+    pip install -r requirements.txt
+    ```
+
+4. **Run the application**:
+    ```
+    streamlit run app.py
+    ```
diff --git a/app.py b/app.py
@@ -0,0 +1,100 @@
+import streamlit as st
+import os
+import tempfile
+from src.pytesseract_ocr import PDFToTextConverter 
+from src.summarizer import TextSummarizer
+
+# Initialize the summarizer
+# summarizer = TextSummarizer()
+
+# Custom CSS for styling, injected as a raw <style> block through
+# st.markdown (hence unsafe_allow_html=True).
+# NOTE(review): `.reportview-container` and `.sidebar .sidebar-content` look
+# like class names from older Streamlit releases — confirm they still match
+# the DOM of the installed version, otherwise those two rules are no-ops.
+st.markdown("""
+    <style>
+    .reportview-container {
+        background: url("https://source.unsplash.com/weekly?water");
+        background-size: cover;
+    }
+    .sidebar .sidebar-content {
+        background: rgba(255, 255, 255, 0.8);
+    }
+    h1 {
+        color: #0E1117;
+    }
+    .stButton>button {
+        color: #ffffff;
+        background-color: #0E1117;
+        border-radius: 20px;
+        border: 1px solid #9c27b0;
+        padding: 10px 24px;
+        font-size: 16px;
+        font-weight: bold;
+    }
+    .stTextArea>textarea {
+        border-radius: 10px;
+        border: 2px solid #9c27b0;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+# Title and introduction
+st.title("PDF Text Summarizer")
+
+# Sidebar for selection: `option` holds the chosen label ("Upload PDF" or
+# "Enter Text") and selects which input branch is rendered further down.
+option = st.sidebar.radio(
+    "Choose the input method:",
+    ("Upload PDF", "Enter Text")
+)
+
+# Process PDF
+def process_pdf(file_path):
+    converter = PDFToTextConverter(file_path)
+    return converter.convert()
+
+# Main container
+with st.container():
+    st.write("## Summary Output")
+    st.markdown("""
+This tool allows you to extract and summarize text from PDF documents or directly from your input. 
+Choose your preferred method and get concise summaries quickly and efficiently.
+""")
+    output_container = st.empty()
+
+    # Handle PDF upload
+    if option == "Upload PDF":
+        st.markdown("### Upload your PDF")
+        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
+
+        if st.button('Summarize PDF'):
+            if uploaded_file is not None:
+                with st.spinner('Extracting and summarizing...'):
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                        tmp.write(uploaded_file.getvalue())
+                        extracted_text = process_pdf(tmp.name)
+
+                    os.remove(tmp.name)
+
+                    if extracted_text:
+                        # summarized_text = summarizer.summarize(extracted_text)
+                        output_container.markdown("### Extracted Text:")
+                        output_container.write(extracted_text)
+                        output_container.markdown("### Summarized Text:")
+                        # output_container.write(summarized_text)
+                    else:
+                        st.error("No text extracted from PDF.")
+            else:
+                st.error("Please upload a PDF file.")
+
+    # Handle text input
+    elif option == "Enter Text":
+        st.markdown("### Enter your text")
+        user_input_text = st.text_area("Input your text here:")
+
+        if st.button('Summarize Text'):
+            if user_input_text:
+                # summarized_text = summarizer.summarize(user_input_text)
+                output_container.markdown("### Original Text:")
+                output_container.write(user_input_text)
+                output_container.markdown("### Summarized Text:")
+                # output_container.write(summarized_text)
+            else:
+                st.error("Please enter some text to process.")
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,3 @@
-PyPDF2==1.26.0
+streamlit
+transformers
+pypdf
diff --git a/src/llm/summarizer.py b/src/llm/summarizer.py
diff --git a/src/ocr/test.pdf b/src/ocr/test.pdf
diff --git a/src/ocr/test.py b/src/ocr/test.py
diff --git a/src/ocr/pytesseract_ocr.py → src/pytesseract_ocr.py b/src/ocr/pytesseract_ocr.py → src/pytesseract_ocr.py
diff --git a/src/summarizer.py b/src/summarizer.py
@@ -0,0 +1,30 @@
+from transformers import pipeline
+
+class TextSummarizer:
+    def __init__(self, model_name="facebook/bart-large-cnn"):
+        """
+        Initialize the TextSummarizer with a specific model.
+
+        Args:
+            model_name (str): The name of the model to be used for summarization.
+        """
+        self.model_name = model_name
+        self.summarizer = pipeline("summarization", model=self.model_name)
+
+    def summarize(self, text, max_length=50, min_length=5):
+        """
+        Summarize the given text using the loaded model.
+
+        Args:
+            text (str): The text to be summarized.
+            max_length (int): The maximum length of the summarized text.
+            min_length (int): The minimum length of the summarized text.
+
+        Returns:
+            str: The summarized text, or an error message if an exception occurs.
+        """
+        try:
+            summary = self.summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
+            return summary[0]['summary_text']
+        except Exception as e:
+            return f"An error occurred: {e}"