# streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Function to load data
def load_data(uploaded_file):
    """Read a Goodreads library export into a DataFrame with typed columns.

    Args:
        uploaded_file: Anything ``pandas.read_csv`` accepts: a path or URL
            string, or a file-like object such as an uploaded file.

    Returns:
        The parsed DataFrame. ``Date Added`` and ``Date Read`` are converted
        to datetimes and ``Year Published`` to a number; values that cannot
        be parsed become NaT/NaN instead of raising.

    Raises:
        KeyError: If one of the three converted columns is missing.
    """
    # read_csv treats paths, URLs and file-like objects alike, so no
    # type-based dispatch is needed before calling it.
    data = pd.read_csv(uploaded_file)
    for column in ('Date Added', 'Date Read'):
        data[column] = pd.to_datetime(data[column], errors='coerce')
    data['Year Published'] = pd.to_numeric(data['Year Published'], errors='coerce')
    return data
# Function for distribution of book ratings
def plot_book_ratings(data, year=None):
    """Build a histogram of the ``My Rating`` column.

    When *year* is truthy, only rows whose ``Date Added`` falls in that year
    are plotted and the year is appended to the chart title.

    Returns the Plotly figure.
    """
    subset = data[data['Date Added'].dt.year == year] if year else data
    year_label = "in " + str(year) if year else ""
    return px.histogram(
        subset,
        x='My Rating',
        title=f'Distribution of Book Ratings {year_label}',
        color_discrete_sequence=["lightblue"],
    )
# Function for most common authors
def plot_common_authors(data, year=None):
    """Build a horizontal bar chart of the ten most frequent ``Author`` values.

    When *year* is truthy, only rows whose ``Date Added`` falls in that year
    are counted and the year is appended to the chart title.

    Returns the Plotly figure.
    """
    rows = data
    if year:
        added_in_year = rows['Date Added'].dt.year == year
        rows = rows[added_in_year]

    top_ten = Counter(rows['Author']).most_common(10)
    authors_df = pd.DataFrame(top_ten, columns=['Author', 'Count'])

    heading = 'Most Common Authors ' + ("in " + str(year) if year else "")
    return px.bar(
        authors_df,
        x='Count',
        y='Author',
        title=heading,
        orientation='h',
        color_discrete_sequence=["lightpink"],
    )
# Function for cumulative books plot
def plot_cumulative_books(data, year=None):
    """Build a line chart of the running count of books ordered by ``Date Added``.

    When *year* is truthy, only rows whose ``Date Added`` falls in that year
    are included and the year is appended to the chart title.

    Returns the Plotly figure.
    """
    selected = data[data['Date Added'].dt.year == year] if year else data
    timeline = selected.sort_values('Date Added')
    # Number the rows 1..N in date order to get the running total.
    timeline = timeline.assign(**{'Cumulative Books': range(1, len(timeline) + 1)})

    suffix = "in " + str(year) if year else ""
    return px.line(
        timeline,
        x='Date Added',
        y='Cumulative Books',
        title=f'Cumulative Number of Books Added {suffix}',
        color_discrete_sequence=['lightblue'],
    )
# Function for the distribution of book lengths
def plot_book_lengths(data, year=None):
    """Build a histogram of ``Number of Pages`` for rows with a positive page count.

    When *year* is truthy, only rows whose ``Date Added`` falls in that year
    are included and the year is appended to the chart title.

    Returns the Plotly figure.
    """
    rows = data[data['Date Added'].dt.year == year] if year else data

    pages = rows['Number of Pages']
    has_positive_length = pages.notna() & (pages > 0)

    year_label = "in " + str(year) if year else ""
    return px.histogram(
        rows[has_positive_length],
        x='Number of Pages',
        title=f'Distribution of Book Lengths {year_label}',
        color_discrete_sequence=["lightpink"],
    )
# Function for read vs. unread books
def plot_read_unread_books(data, year=None):
    """Build a pie chart comparing rows with ``Read Count`` > 0 against rows with ``Read Count`` == 0.

    When *year* is truthy, only rows whose ``Date Added`` falls in that year
    are counted and the year is appended to the chart title.

    Returns the Plotly figure.
    """
    rows = data[data['Date Added'].dt.year == year] if year else data

    read_count = rows['Read Count']
    tallies = {
        'Read Books': (read_count > 0).sum(),
        'Unread Books': (read_count == 0).sum(),
    }

    year_label = "in " + str(year) if year else ""
    return px.pie(
        names=list(tallies),
        values=list(tallies.values()),
        title=f'Read vs. Unread Books {year_label}',
        color_discrete_sequence=["lightgreen", "lightblue"],
    )
# Function to generate a word cloud
def generate_wordcloud(data, title):
    """Render a word cloud built from the ``Title`` column of *data*.

    Args:
        data: DataFrame with a ``Title`` column; missing titles are skipped.
        title: Heading drawn above the cloud.

    Returns:
        A Matplotlib ``Figure`` suitable for ``st.pyplot``. When the titles
        yield no usable words, the figure shows a short placeholder message
        instead of a cloud.
    """
    # Build the Figure directly instead of through pyplot: pyplot keeps every
    # figure it creates alive until it is explicitly closed, which piles up
    # across reruns of a long-lived app.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.axis('off')
    ax.set_title(title, fontsize=18)

    # astype(str) guards against titles that were parsed as numbers.
    text = ' '.join(data['Title'].dropna().astype(str))
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text contains no words to plot
        # (e.g. no books were read in the selected year).
        ax.text(0.5, 0.5, 'No book titles to display',
                ha='center', va='center', fontsize=14, transform=ax.transAxes)
    else:
        ax.imshow(wordcloud, interpolation='bilinear')
    return fig
# Customizing Streamlit's theme
st.set_page_config(page_title="Goodreads Wrapped", layout="wide")

# Main Streamlit app
st.title("Goodreads Wrapped")

# Instructions
st.markdown("""
**Instructions:**
1. Export your Goodreads library to a CSV file.
- Go to 'My Books' on Goodreads.
- Under 'Tools' on the left, click on 'Import and Export'.
- Click 'Export Library' to generate your CSV file.
- Download the CSV file once it's ready.
2. Upload your CSV file here to see the visualizations.
3. Explore various insights about your reading habits!
""")

# Both stay None until a dataset has been loaded and a year selected; the
# visualizations below are gated on `data`, so they render for uploads and
# for the example dataset alike.
data = None
year = None

# Sidebar for file upload and year selection
with st.sidebar:
    uploaded_file = None

    # Option for users to select the dataset source
    dataset_source = st.radio(
        "Choose your dataset source",
        ('Upload my dataset', 'Use example dataset')
    )

    if dataset_source == 'Upload my dataset':
        uploaded_file = st.file_uploader("Upload your Goodreads CSV", type="csv")
        if uploaded_file is not None:
            try:
                data = load_data(uploaded_file)
            except Exception as e:
                # Top-level UI boundary: report the problem instead of crashing the page.
                st.error(f"Failed to load your dataset. Error: {e}")
    else:
        # Load the example dataset from GitHub
        example_data_url = "https://raw.githubusercontent.com/gigikenneth/goodreads/main/goodreads_library_export.csv"
        try:
            data = load_data(example_data_url)
        except Exception as e:
            st.error(f"Failed to load the example dataset. Error: {e}")

    # Year selection, only once data is loaded
    if data is not None:
        # Unparseable dates were coerced to NaT; ignore them when finding the range.
        added_years = data['Date Added'].dt.year.dropna()
        if added_years.empty:
            st.error("No valid 'Date Added' values were found in the dataset.")
            data = None
        else:
            min_year = int(added_years.min())
            max_year = int(added_years.max())
            year = st.select_slider("Select Year", options=range(min_year, max_year + 1), value=max_year)

    # NOTE(review): the original indentation was lost, so placing this link
    # inside the sidebar is an assumption - confirm against the intended layout.
    st.markdown('<a href="https://github.com/gigikenneth/goodreads" target="_blank"><img src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg" width="30" height="30" alt="GitHub"></a>', unsafe_allow_html=True)

st.sidebar.markdown('Made chaotically at 3am🌪️ by [Gigi](https://github.com/gigikenneth)')


def _show_wordcloud(books, caption):
    """Render the title word cloud for *books*, or a notice when it has no titles."""
    if books['Title'].notna().any():
        st.pyplot(generate_wordcloud(books, caption))
    else:
        st.info("No read books with titles to show here.")


# Visualization code
if data is not None:
    # Each entry: (all-time heading, selected-year heading, plotting function).
    comparisons = [
        ("Distribution of Book Ratings",
         f"Distribution of Book Ratings in {year}",
         plot_book_ratings),
        ("Cumulative Number of Books Added Over Time",
         f"Cumulative Number of Books Added in {year}",
         plot_cumulative_books),
        ("Most Common Authors",
         f"Most Common Authors in {year}",
         plot_common_authors),
        ("Distribution of Book Lengths",
         f"Distribution of Book Lengths in {year}",
         plot_book_lengths),
        ("Read vs. Unread Books",
         f"Read vs. Unread Books in {year}",
         plot_read_unread_books),
    ]

    # Side by side comparison: all-time on the left, selected year on the right.
    for all_time_heading, yearly_heading, make_plot in comparisons:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader(all_time_heading)
            st.plotly_chart(make_plot(data))
        with col2:
            st.subheader(yearly_heading)
            st.plotly_chart(make_plot(data, year=year))

    # Word Clouds - Side by side comparison
    read_books = data[data['Read Count'] > 0]
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Word Cloud of Book Titles Read")
        _show_wordcloud(read_books, "All Time")
    with col2:
        st.subheader(f"Word Cloud of Book Titles Read in {year}")
        # The yearly cloud is keyed on when a book was read, not when it was added.
        read_books_year = read_books[read_books['Date Read'].dt.year == year]
        _show_wordcloud(read_books_year, f"In {year}")