-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstream_app.py
207 lines (154 loc) · 6.34 KB
/
stream_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from sklearn import cluster
import streamlit as st
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from wordcloud import WordCloud
import bz2
plt.style.use("ggplot")
plt.rcParams['figure.figsize'] = [10, 4]
showPyplotGlobalUse = False
# App layout
st.set_page_config(layout="centered", initial_sidebar_state="collapsed")
siteHeader = st.container()
Clust = st.container()
WordCl = st.container()
dataExploration = st.container()
FindWords = st.container()
WordCount = st.container()
Simil = st.container()
@st.cache(allow_output_mutation=True)
def read_data():
with bz2.BZ2File('data.pbz2', 'rb') as f:
df = pickle.load(f)
df["cluster"] = df["cluster"].astype("str")
return df
def find(df, word, times=100):
ind = []
counts = []
for i, row in df.iterrows():
c = row["text"].count(word)
if c >= times:
ind.append(i)
counts.append(c)
to_ret = df.iloc[ind].copy()
to_ret["counts"] = counts
return to_ret
def and_top_words(df, n=20):
flg = True
for _, row in df.iterrows():
tops = pd.DataFrame(pd.Series(row["text"]).value_counts(normalize=True))
tops = tops.rename(columns={0: row["title"]})
if flg:
all_tops = tops.copy()
flg = False
else:
all_tops = all_tops.merge(tops,left_index=True, right_index=True)
return all_tops[:n]
def plot_top_words(df, selected, num=20):
rows = df[df["title"].isin(selected)]
return and_top_words(rows, num).plot.bar().get_figure()
def get_closest(title, n):
inp = np.array(df[df["title"] == title][["tsne1", "tsne2"]])
dists = []
for _,row in df.iterrows():
dists.append(np.linalg.norm(np.array(row[["tsne1", "tsne2"]]) - inp))
dd = df.loc[:,["title"]]
dd["dist"] = dists
dd.index = dd["title"]
dd = dd.sort_values(by="dist")
dd = dd.drop(title)
#return dd.head(n).plot.barh().get_figure()
fig = px.bar(dd.head(n), x="title", y="dist", text="title")
fig.update_layout(xaxis_visible=False, showlegend=True)
return fig
def plot_cloud(df, selected):
txt = " ".join(df[df["title"]==selected]["text"].item())
wc = WordCloud(width = 800, height = 800).generate(txt)
fig = plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wc)
plt.axis("off")
plt.tight_layout(pad = 0)
return fig
### APP CODE ###
df = read_data() # load data
# Podcast analytics page
def data_analysis():
# Title
with siteHeader:
st.title('Lex Fridman Podcast NLP')
st.markdown('''An app to explore the [Lex Fridman Podcast](https://lexfridman.com/podcast/).''')
st.markdown('''Some time ago, I was wondering if I could choose an episode based on its
similarity with others that I had already listened to. So I decided to
create a clustering of the episodes based on the text transcripts.''')
st.markdown('''After downloading the data, I started doing some analysis and realised
that there were a few interesting statistics about the episodes.
Thus, I decided to create a set of widgets and put them in this app.''')
st.markdown('''Note: The list will not be updated to the latest episode since
the encoding phase isn't automated yet. Also some episodes are
misssing since there was no available transcription.''')
# Cluster
with Clust:
st.header("Clustering of the episodes")
st.markdown(
'''The clustering was created by encoding the text from the transcriptions with the
[Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-large/5) model by Google and by applying the [KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) algorithm
to the results.
Note: the transcription were parsed, tokenized and the most stop words removed to
improve performance.'''
)
fig = px.scatter(df, x="tsne1", y="tsne2", color="cluster", hover_name="title")
fig.update_layout(xaxis_visible=False, yaxis_visible=False, showlegend=False)
st.plotly_chart(fig)
# Word search & count
with FindWords:
st.header("Word Search")
st.markdown("Here we can search for a term and the number of occurences.")
w = st.text_input('Which word would you like to search for?', 'ai')
rep = st.slider('Number of times that the word has been said?', min_value=1, max_value=1000, value=20, step=10)
st.write(find(df, w, rep)[["title", "person", "counts"]])
# Most common words
with WordCount:
st.header("Common words")
st.markdown("Find most common words for each podcast.")
sel_pod = st.multiselect("Choose one or more podcasts", df["title"].tolist())
n_words = st.slider("Number of words to plot", min_value=5, max_value=100, step=1, value=20)
but_count = st.checkbox("Show")
if but_count and len(sel_pod) > 0:
st.pyplot(plot_top_words(df, sel_pod, n_words))
# word cloud
with WordCl:
st.header("Word Cloud")
st.markdown('''Create [word cloud](https://pypi.org/project/wordcloud/) by choosing the podcast title and click on "show image."''')
selection_col, display_col = st.columns(2)
sel_pod_cloud = selection_col.selectbox("Choose the podcast...", df["title"].tolist())
but_cloud = selection_col.checkbox("Show Image")
if but_cloud:
display_col.write(plot_cloud(df, sel_pod_cloud))
# Similarity
with Simil:
st.header("Find similar podcasts")
st.markdown("In this section, it is possible to find the most similar podcasts to the one in input. The metric used for similarity is the cosine distance between the text embeddings.")
sel_pod2 = st.selectbox("Choose an episode", df["title"].tolist())
n_words2 = st.slider("Number of similar episodes to plot", min_value=2, max_value=20, step=1, value=5)
but_count2 = st.checkbox("Show", key="similcheck")
if but_count2 and len(sel_pod2) > 0:
#st.pyplot(get_closest(sel_pod2, n_words2))
st.plotly_chart(get_closest(sel_pod2, n_words2))
# About page
#def about():
# st.title("The project")
# Other info
#def future():
# st.title("Next steps...")
# pages = {
# "Podcast data analysis": data_analysis,
# #"About the project": about,
# #"Next steps": future
# }
# st.sidebar.title("Menu")
# page = st.sidebar.radio("Select your page", tuple(pages.keys()))
# pages[page]()
data_analysis()