# sentiment.py
'''
Main module that performs sentiment analysis using the VADER sentiment
analysis library (https://github.com/cjhutto/vaderSentiment).
Three main tasks:
(1) Is the trending topic overall positive, negative, or neutral?
(2) What is the distribution of the sentiment of individual tweets?
(3) Can we locate 'outlier' tweets using sentiment?
@author Theodora Bendlin
'''
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.cluster import DBSCAN, KMeans
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection
from extern import *
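# Assumed to come from extern (their definitions are not in this file):
# log, sample, SAMPLE_SIZE, K_START, K_END, BEST_K_IDX,
# DEFAULT_NUM_CLUSTERS, MAX_PRINTED_CLUSTERS.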
"""
Main method that will find the main (6) cluster representations for
a target file:
- The most positive, negative and neutral clusters
- The largest clusters (exclusive from previous group)
@parameter target (str) target data file
@parameter mock (boolean) true if random tweets should be chosen, false otherwise
@parameter cluster_method (str) which type of clustering to use, kmeans or dbscan
@parameter debug (boolean) if debug mode for kmeans clustering mode should be enabled
@parameter plot_clusters (boolean) if cluster plot should show up
@return (object) reps representation for representative tweets
"""
def find_sentiment_cluster_reps(target, mock, cluster_method='kmeans', debug=False, plot_clusters=False):
    log(f'Performing sentiment analysis on {target}...')

    # Uses a subsample of the data, as in cluster.py
    log('\tReading in data...')
    t_sample = sample(target)

    # If mocking the data, use a random sample
    if mock:
        subsamp = t_sample[np.random.choice(t_sample.shape[0], 6)]
        reps = [[0, 0.0, item] for item in subsamp]  # placeholder scores, filled in below

        # Use the actual sentiment analyzer so it's not as obvious that the
        # tweet reps are randomly chosen
        analyzer = SentimentIntensityAnalyzer()
        for idx, row in enumerate(subsamp):
            sentiment = analyzer.polarity_scores(row['text'])
            reps[idx][1] = sentiment['compound']

        # Mock cluster sizes so they appear reasonable
        reps[0][0] = np.random.randint(SAMPLE_SIZE // 5, SAMPLE_SIZE // 3)
        reps[1][0] = np.random.randint(SAMPLE_SIZE // 10, SAMPLE_SIZE // 3)
        reps[2][0] = np.random.randint(SAMPLE_SIZE // 8, SAMPLE_SIZE // 3)
        reps[3][0] = np.random.randint(SAMPLE_SIZE // 5, int(SAMPLE_SIZE / 1.5))
        reps[4][0] = np.random.randint(SAMPLE_SIZE // 5, SAMPLE_SIZE // 2)
        reps[5][0] = np.random.randint(SAMPLE_SIZE // 2, int(SAMPLE_SIZE / 1.5))

        # Order the first three as most positive, most negative, most neutral:
        # sort by compound descending, then swap the bottom two
        reps[:3] = sorted(reps[:3], key=lambda x: x[1], reverse=True)
        reps[1], reps[2] = reps[2], reps[1]

        # Sort the last three by size, descending
        reps[3:] = sorted(reps[3:], key=lambda x: x[0], reverse=True)

        return reps

    # Converts the tweet sample to a dataframe of tweets with sentiment values
    sentiment_df = get_sentiment_data_frame(t_sample)

    # Run one type of clustering, passed in as an optional command line arg
    if cluster_method == 'kmeans':
        clustering = run_k_means(sentiment_df[['pos', 'neg', 'neu']].values, debug)
    else:
        clustering = run_dbscan(sentiment_df[['pos', 'neg', 'neu']].values)
    sentiment_df['cluster_label'] = clustering.labels_

    if plot_clusters:
        plot_clustering_results(sentiment_df, clustering.labels_.max() + 1,
                                plot_title="Sentiment Clustering Using " + cluster_method)

    # Collect cluster and centroid point data into one dataframe.
    # Note: DBSCAN labels noise points -1; they are excluded here.
    cluster_df = get_cluster_centers_info(sentiment_df, list(range(0, clustering.labels_.max() + 1)))

    # Gets the top 3 extreme clusters (most pos, neg, neu) and the largest 3
    # clusters that are not among the most extreme
    extreme_clusters = get_most_extreme_clusters(cluster_df)
    largest_clusters = get_k_largest_clusters(cluster_df, excluded_clusters=extreme_clusters)

    # Returns the reps representation from cluster.py
    return convert_to_reps(t_sample, [extreme_clusters, largest_clusters])
'''
Calculates the sentiment values for each tweet using the VADER library
and stores them in a pandas dataframe for analysis and manipulation.
The scores from VADER are as follows:
    compound - the overall sentiment score of the tweet, in [-1, 1]
    pos - the proportion of the tweet with positive sentiment, in [0, 1]
    neg - the proportion of the tweet with negative sentiment, in [0, 1]
    neu - the proportion of the tweet with neutral sentiment, in [0, 1]
@param t_sample (list) subsample of tweets
@returns (Dataframe) pandas dataframe of tweets and sentiment values
'''
def get_sentiment_data_frame(t_sample):
    tweets = [row['text'] for row in t_sample]
    tweets_map = {
        'tweet_idx': [],
        'compound': [],
        'pos': [],
        'neg': [],
        'neu': []
    }

    # Sentiment analysis object from VADER
    analyzer = SentimentIntensityAnalyzer()
    for count, tweet in enumerate(tweets):
        tweet_sentiment = analyzer.polarity_scores(tweet)
        tweets_map['compound'].append(tweet_sentiment['compound'])
        tweets_map['pos'].append(tweet_sentiment['pos'])
        tweets_map['neg'].append(tweet_sentiment['neg'])
        tweets_map['neu'].append(tweet_sentiment['neu'])
        tweets_map['tweet_idx'].append(count)

    return pd.DataFrame(data=tweets_map)
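# Illustrative note, not from the original file: VADER's polarity_scores()
# returns a dict of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...},
# where pos/neg/neu are proportions summing to ~1.0 and compound is a
# normalized score in [-1, 1], so each dataframe row holds one tweet's four
# scores plus its index into t_sample.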
'''
Helper method that runs KMeans on the twitter dataframe.
To choose the best number of clusters, this method should be run with
debug set to true so that the SSE plot is shown. The BEST_K_IDX
global parameter can then be adjusted to show the clusters for a given
'k', where:
    idx  0  1  2  3 ...
    k    3  4  5  6 ...
(so, e.g., BEST_K_IDX = 2 selects k = 5, assuming K_START is 3 as the
table suggests)
@param tweets_df (Dataframe) dataframe of tweets and sentiment scores
@param debug (boolean) flag for printing out kmeans information
'''
def run_k_means(tweets_df, debug):
    sse = []
    clusters = []
    if debug:
        for k in range(K_START, K_END):
            kmeans_result = KMeans(n_clusters=k, random_state=1).fit(tweets_df)
            clusters.append(kmeans_result)
            sse.append(kmeans_result.inertia_)
            log(f'k = {k}, sse = {kmeans_result.inertia_}')

        # Elbow plot of SSE against the number of clusters
        plt.figure()
        plt.plot(np.arange(K_START, K_END), sse)
        plt.xlabel('Number of clusters')
        plt.ylabel('Sum of Squared Errors')
        plt.show()

        # TODO: Which cluster is actually the best?
        return clusters[BEST_K_IDX]
    else:
        return KMeans(n_clusters=DEFAULT_NUM_CLUSTERS, random_state=1).fit(tweets_df)
'''
Runs the DBSCAN algorithm. Separate function for consistency
and to support additional operations later on.
@param tweets_df (Dataframe) dataframe of tweets and sentiment scores
'''
def run_dbscan(tweets_df):
    return DBSCAN(eps=0.015, min_samples=10).fit(tweets_df)
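# Optional sanity check for eps (a sketch, not part of the original pipeline):
# plot each point's distance to its min_samples-th nearest neighbour, sorted,
# and look for the elbow. X stands for the (pos, neg, neu) feature matrix.
# from sklearn.neighbors import NearestNeighbors
# dists, _ = NearestNeighbors(n_neighbors=10).fit(X).kneighbors(X)
# plt.plot(np.sort(dists[:, -1])); plt.show()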
'''
Helper function that plots the clustering results
using the matplotlib library.
@param tweets_df (Dataframe) dataframe of tweets and sentiment scores
@param num_clusters (int) the number of clusters to plot
@param plot_title (str) title for the plot
'''
def plot_clustering_results(tweets_df, num_clusters, plot_title="Clustering of Sentiment Distribution"):
    figure = plt.figure()
    axes = figure.add_subplot(111, projection='3d')
    colors = iter(cm.rainbow(np.linspace(0, 1, num_clusters)))
    for n in range(num_clusters):
        cluster_df = tweets_df[tweets_df['cluster_label'] == n]
        label = 'Cluster {}'.format(n)
        x_vals = cluster_df[['pos']].values
        y_vals = cluster_df[['neg']].values
        z_vals = cluster_df[['neu']].values
        axes.scatter(x_vals, y_vals, z_vals, color=next(colors), label=label)
    plt.title(plot_title)
    axes.set_xlabel('Positive')
    axes.set_ylabel('Negative')
    axes.set_zlabel('Neutral')
    plt.show()
"""
Finds the tweet closest to the cluster centers and returns a dataframe
with cluster info as well as some useful debugging information.
@parameter tweets_df (Pandas dataframe) dataframe with cluster assignments
and sentiment values
@parameter clusters (array) int array 0 - n, the number of clusters
@returns (Pandas dataframe) cluster info dataframe
"""
def get_cluster_centers_info(tweets_df, clusters):
    # Determine the cluster centers, defined as the mean compound, pos, neg,
    # and neu values over all the points in the cluster
    cluster_centers = []
    for cluster in clusters:
        cluster_df = tweets_df[tweets_df['cluster_label'] == cluster]
        cluster_size = len(cluster_df)
        compound = cluster_df[['compound']].values
        pos = cluster_df[['pos']].values
        neg = cluster_df[['neg']].values
        neu = cluster_df[['neu']].values
        cluster_centers.append((compound.sum() / cluster_size, pos.sum() / cluster_size,
                                neg.sum() / cluster_size, neu.sum() / cluster_size))

    # Separate dataframe to keep track of cluster stats
    cluster_info_df = {
        'overall_compound': [],
        'overall_pos': [],
        'overall_neg': [],
        'overall_neu': [],
        'center_tweet_id': [],
        'center_compound': [],
        'center_pos': [],
        'center_neg': [],
        'center_neu': [],
        'cluster_size': [],
        'cluster_label': []
    }
    for cluster in clusters:
        centroid = cluster_centers[cluster]
        cluster_points = tweets_df[tweets_df['cluster_label'] == cluster]

        # The center tweet is the point with the smallest Euclidean distance
        # to the (pos, neg, neu) centroid
        min_distance = float('inf')
        center_point = None
        for _, row in cluster_points.iterrows():
            point = row[['pos', 'neg', 'neu']].values
            point_dist = np.linalg.norm(point - centroid[1:])
            if point_dist < min_distance:
                min_distance = point_dist
                center_point = row

        cluster_info_df['overall_compound'].append(centroid[0])
        cluster_info_df['overall_pos'].append(centroid[1])
        cluster_info_df['overall_neg'].append(centroid[2])
        cluster_info_df['overall_neu'].append(centroid[3])
        cluster_info_df['center_tweet_id'].append(center_point['tweet_idx'])
        cluster_info_df['center_compound'].append(center_point['compound'])
        cluster_info_df['center_pos'].append(center_point['pos'])
        cluster_info_df['center_neg'].append(center_point['neg'])
        cluster_info_df['center_neu'].append(center_point['neu'])
        cluster_info_df['cluster_label'].append(cluster)
        cluster_info_df['cluster_size'].append(len(cluster_points))

    return pd.DataFrame(data=cluster_info_df)
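# Equivalent centroid computation, shown for reference (a sketch; it should
# match the explicit loop above for non-empty clusters):
# centers = tweets_df.groupby('cluster_label')[['compound', 'pos', 'neg', 'neu']].mean()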
"""
Gets the "most extreme" clusters using the cluster
info dataframe.
@parameter cluster_info_df (pandas dataframe) cluster info
@returns (pandas dataframe)
"""
def get_most_extreme_clusters(cluster_info_df):
    # argsort on the absolute distance of each cluster's mean compound score
    # from +1, -1, and 0 gives positional indices sorted by closeness, so
    # [:1] selects the most positive, most negative, and most neutral cluster
    pos_sort = cluster_info_df.iloc[(cluster_info_df['overall_compound'] - 1).abs().argsort()[:1]]
    neg_sort = cluster_info_df.iloc[(cluster_info_df['overall_compound'] + 1).abs().argsort()[:1]]
    neu_sort = cluster_info_df.iloc[(cluster_info_df['overall_compound'] - 0).abs().argsort()[:1]]
    return pd.concat([pos_sort, neg_sort, neu_sort])
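# Illustrative example with hypothetical centroids: given overall_compound
# values [0.62, -0.41, 0.03], the three selections above pick 0.62 (closest
# to +1), -0.41 (closest to -1), and 0.03 (closest to 0).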
"""
Gets the largest clusters, exclusive from the list passed in using
the cluster info dataframe.
@parameter cluster_info_df (pandas dataframe) cluster info
@parameter excluded_clusters (array) array of clusters to remove from consideration
@parameter k (int) max number of clusters to return
@returns (pandas dataframe)
"""
def get_k_largest_clusters(cluster_info_df, excluded_clusters, k=MAX_PRINTED_CLUSTERS):
    sorted_clusters = cluster_info_df.sort_values('cluster_size', ascending=False)
    excluded_cluster_labels = excluded_clusters[['cluster_label']].values.flatten()
    return sorted_clusters[~sorted_clusters['cluster_label'].isin(excluded_cluster_labels)].head(k)
"""
Converts the cluster dataframe info view into the representation
expected by the module calling the sentiment functionality.
Expected form:
(1) cluster size,
(2)compound score for the tweet representing the center,
(3) tweet rep, original representation from the sample
@parameter t_sample (array) array of ordered dict objects representing tweets
@parameter cluster_dfs (array) array of cluster dataframes to return
@return reps (array) tuples of tweet info
"""
def convert_to_reps(t_sample, cluster_dfs):
    reps = []
    for cluster_df in cluster_dfs:
        for _, cluster in cluster_df.iterrows():
            reps.append([
                int(cluster['cluster_size']),
                cluster['center_compound'],
                t_sample[int(cluster['center_tweet_id'])]
            ])
    return reps
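# A minimal usage sketch, not part of the original module. The CSV path is
# hypothetical and assumes extern.sample() can read it; reps come back as
# [cluster size, center compound score, tweet record] triples.
if __name__ == '__main__':
    reps = find_sentiment_cluster_reps('data/trending_topic.csv', mock=False,
                                       cluster_method='kmeans')
    for size, compound, tweet in reps:
        print(f"{size:>5} tweets | compound {compound:+.3f} | {tweet['text'][:80]}")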