-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
274 lines (239 loc) · 10 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import os
import io
import base64
import requests
import traceback
from typing import Literal
import yaml
import streamlit as st
from bs4 import BeautifulSoup
from google.genai import types
import streamlit_authenticator as stauth
import prompts
from utils import summarize_common
# ==============================================================
# 1. SET PAGE CONFIG BEFORE ANYTHING ELSE!
# (We use the centered layout + a custom page title/icon)
# ==============================================================
st.set_page_config(
page_title="SummarizeIQ",
page_icon="📄",
layout="centered"
)
# ==============================================================
# 2. APPLY CUSTOM CSS (optional)
# You can add or modify any custom styling here
# ==============================================================
def apply_custom_css():
    """
    Inject the app's custom stylesheet into the page.

    The CSS is emitted through st.markdown with unsafe_allow_html=True so the
    <style> element is passed to the browser as raw HTML. It centers the
    ".logo-container" element, fixes the logo width at 80px, and colors
    elements carrying the "p" class green.
    """
    # The stylesheet lines are kept at column 0 on purpose: the literal is
    # handed to a Markdown renderer, where leading indentation is significant.
    st.markdown(
        """
<style>
/* Center the logo container */
.logo-container {
display: flex;
justify-content: center;
margin: 1rem 0 2rem 0;
}
.logo-container img {
width: 80px; /* control the size of the logo here */
height: auto;
}
.p {
color: green;
}
</style>
""",
        unsafe_allow_html=True,
    )
apply_custom_css()

# Read the logo from disk and inline it as a base64 data URI, so the image
# can be rendered through st.markdown's raw-HTML path inside the
# "logo-container" element styled by apply_custom_css().
with open("assets/logo.png", "rb") as logo_file:
    logo_data = logo_file.read()
logo_base64 = base64.b64encode(logo_data).decode("utf-8")

logo_html = f"""
<div class="logo-container">
<img src="data:image/png;base64,{logo_base64}" alt="Logo">
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
# -----------------------------
# Now your Summarizer interface
# -----------------------------
# Literal type naming the document types offered in the PDF tab's selectbox.
# It is only used as the (optional) annotation for doc_type.
DocumentType = Literal["Research paper", "Literature Review", "Report"]
# # Add a responsive logo at the top (already in custom CSS)
# with open("assets/logo.png", "rb") as f:
# logo_data = f.read()
# logo_base64 = base64.b64encode(logo_data).decode("utf-8")
# st.markdown(
# f"""
# <div class="logo-container">
# <img src="data:image/png;base64,{logo_base64}" alt="Logo">
# </div>
# """,
# unsafe_allow_html=True
# )
st.title("SummarizeIQ")

# Short introduction rendered directly below the page title.
app_intro = """
This application summarizes PDF content or URLs (webpages, articles, etc.).
Please select one of the tabs below to use the corresponding feature.
"""
st.write(app_intro)
# -------------------------------------------------------------------------
# Helper function to extract text and images from a URL (Gemini image desc)
# -------------------------------------------------------------------------
def extract_text_and_images_from_url(
    url: str, timeout: float = 15.0
) -> list[types.Part]:
    """
    Fetch a web page and convert its text and images into Gemini parts.

    The text is taken from the first <article> element when the page has one,
    otherwise from the whole document. Every <img> whose src is not a "data:"
    URI is downloaded (relative URLs are resolved against ``url``) and
    attached as an inline image part.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of parts: the text part first (only when the page has
        non-blank text), followed by one part per image that could be
        downloaded. The list is empty when the page request did not return
        status 200 (an error is shown with st.error in that case) or when the
        page yielded neither text nor images; callers should treat an empty
        list as "nothing to summarize".

    Raises:
        requests.RequestException: If the request for the page itself fails
            at the network level (including a timeout). Failures on
            individual images are skipped instead of raised.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            " AppleWebKit/537.36 (KHTML, like Gecko)"
            " Chrome/94.0.4606.81 Safari/537.36"
        )
    }

    # Without a timeout a slow or unresponsive host would block the app
    # indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        st.error(f"Failed to fetch the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Try to find the main <article> tag; fall back to the entire page.
    article = soup.find("article")
    text_content = article.get_text() if article else soup.get_text()

    parts = []
    # A blank text part carries nothing for the model, so only attach the
    # text when it has content. This also lets an empty result signal
    # "no usable data" to the caller.
    if text_content.strip():
        parts.append(types.Part.from_text(text=text_content))

    for image in soup.find_all("img"):
        img_src = image.get("src")
        if not img_src or img_src.startswith("data:"):
            continue

        img_url = requests.compat.urljoin(url, img_src)  # handle relative URLs
        try:
            img_response = requests.get(img_url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Images are best-effort: one unreachable or malformed image URL
            # must not abort the whole extraction.
            continue
        if img_response.status_code != 200 or not img_response.content:
            continue

        # Use the type the server declared instead of labelling everything
        # as PNG. Drop any parameters such as "; charset=...".
        declared_type = img_response.headers.get("Content-Type", "")
        mime_type = declared_type.split(";", 1)[0].strip().lower()
        if not mime_type:
            # No declared type: keep the historical default.
            mime_type = "image/png"
        elif not mime_type.startswith("image/"):
            # Not an image at all, e.g. an HTML error page served with 200.
            continue

        parts.append(
            types.Part.from_bytes(data=img_response.content, mime_type=mime_type)
        )

    return parts
# Create tabs in the Streamlit UI
tab1, tab2 = st.tabs(["Summarize PDF", "Summarize URL"])

# Largest PDF accepted for summarization (15 MB).
MAX_PDF_SIZE_BYTES = 15 * 1024 * 1024

# Assistant prompt to use for each selectable document type.
PDF_PROMPTS = {
    "Research paper": prompts.research_paper_prompt,
    "Literature Review": prompts.literature_review_prompt,
    "Report": prompts.report_prompt,
}

# -------------------------------------
# 1. Summarize PDF
# -------------------------------------
with tab1:
    st.subheader("Upload and Summarize a PDF")
    st.success("""
Recommended:
1. Please upload a PDF file (maximum size: 15 MB).
2. For optimal results, upload a PDF with 30 pages or fewer.
""" )

    # Select the document type
    doc_type: DocumentType = st.selectbox(
        "Select Document Type",
        ("Research paper", "Literature Review", "Report"),
        index=0,
    )

    # File uploader for PDF
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

    # Button to trigger summarization
    if st.button("Summarize PDF"):
        if uploaded_file is None:
            st.error("Please upload a PDF file first.")
        elif not uploaded_file.name.lower().endswith(".pdf"):
            st.error("Only PDF files are allowed.")
        else:
            # Measure the upload by seeking to its end, then rewind so the
            # read below starts from the first byte.
            uploaded_file.seek(0, io.SEEK_END)
            file_size = uploaded_file.tell()
            uploaded_file.seek(0)

            if file_size == 0:
                # Give a specific message instead of the generic failure an
                # empty document would otherwise end in.
                st.error("The uploaded file is empty.")
            elif file_size > MAX_PDF_SIZE_BYTES:
                st.error("File size too big. Maximum allowed size is 15 MB.")
            else:
                try:
                    # The raw bytes are passed through as-is; no base64
                    # round-trip is needed to build the part.
                    content = uploaded_file.read()
                    document_part = types.Part.from_bytes(
                        data=content,
                        mime_type="application/pdf",
                    )

                    # Any type without its own entry uses the report prompt.
                    prompt_text = PDF_PROMPTS.get(doc_type, prompts.report_prompt)
                    assistant_prompt_part = types.Part.from_text(text=prompt_text)
                    completion_string = summarize_common(
                        system_prompt=prompts.system_prompt.format(document_name=doc_type),
                        parts=[assistant_prompt_part, document_part],
                    )

                    # Display the summarized content
                    st.success("Summary generated successfully!")
                    st.write(completion_string)
                    st.download_button(
                        f"Download {doc_type} Summary",
                        completion_string,
                        file_name=f"{doc_type}.md",
                    )
                except Exception:
                    # Top-level UI boundary: show a generic message to the
                    # user and print the full traceback to the console.
                    st.error("An error occurred")
                    traceback.print_exc()
# -------------------------------------
# 2. Summarize URL (Extract Text & Images)
# -------------------------------------
with tab2:
    st.subheader("Summarize a Webpage by URL")
    st.warning("This summarizer is under beta testing.The summarization may not be accurate.")
    url = st.text_input("Enter a URL", placeholder="https://example.com/article")

    if st.button("Summarize URL"):
        # Pasted addresses often carry stray whitespace. Require an explicit
        # http(s) scheme so unusable input gets the validation message
        # instead of the generic failure below.
        url = (url or "").strip()
        if not url.lower().startswith(("http://", "https://")):
            st.error("Please enter a valid URL.")
        else:
            try:
                # Extract text & images directly
                parts = extract_text_and_images_from_url(url)
                if parts:
                    # Summarize using the 'article_prompt'
                    assistant_prompt_part = types.Part.from_text(text=prompts.article_prompt)
                    completion_string = summarize_common(
                        system_prompt=prompts.system_prompt.format(document_name="article"),
                        parts=[assistant_prompt_part, *parts],
                    )
                    st.success("Summary generated successfully!")
                    st.write(completion_string)
                    st.download_button(
                        "Download Article Summary",
                        completion_string,
                        file_name="Article summary.md",
                    )
                else:
                    st.error("No valid data found.")
            except Exception:
                # Top-level UI boundary: show a generic message to the user
                # and print the full traceback to the console.
                st.error("An error occurred. Try a different URL.")
                traceback.print_exc()