#
# authors:
# Paul Galatic
#
# description:
# Performs clustering of hashtags.
#

# STD LIB
import os
import csv

# EXTERNAL LIB
import autocorrect
import numpy as np
from scipy.spatial import distance
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer

# PROJECT LIB
from extern import *

def get_stopwords():
    '''Downloads the NLTK stopword list once, then returns it as a set.'''
    if not os.path.exists(CACHE_DIR / 'stopwords.flag'):
        # Make sure the cache directory exists before writing the flag file;
        # otherwise a first run that filters stopwords would fail here.
        os.makedirs(CACHE_DIR, exist_ok=True)
        import nltk
        nltk.download('stopwords')
        open(CACHE_DIR / 'stopwords.flag', 'w').close()
from nltk.corpus import stopwords
return set(stopwords.words('english'))
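
# Illustrative sketch (not part of the original module): shows what the
# stopword filter does to a toy sentence, assuming the one-time NLTK
# download in get_stopwords() succeeds.
def _demo_stopwords():
    stops = get_stopwords()
    sentence = 'this is a test of the stopword filter'
    # Common function words ('this', 'is', 'a', 'of', 'the') are dropped.
    return ' '.join(word for word in sentence.split() if word not in stops)
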
def filter_text(text, speller):
if FILTER_STOPWORDS:
stops = get_stopwords()
text = ' '.join([word for word in text.split() if word not in stops])
if CORRECT_SPELLING:
text = speller(text)
return text
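
# Illustrative sketch (not part of the original module): autocorrect's
# Speller rewrites misspelled tokens, which collapses near-duplicate words
# before vectorization. The exact output depends on the autocorrect version.
def _demo_spelling():
    speller = autocorrect.Speller(lang='en')
    return speller('survice memberz')  # typically 'service members'
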
def find_argcenter(cluster):
    '''Returns the index of the point closest to the mean of the cluster,
    along with its cosine similarity to that mean.'''
    center = np.mean(cluster, axis=0)
    closest = 0
    least_dist = np.inf
    for idx in range(len(cluster)):
        dist = distance.cosine(cluster[idx], center)
        if dist < least_dist:
            least_dist = dist
            closest = idx
    return closest, 1 - least_dist
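
# Illustrative check (not part of the original module): the mean of this toy
# cluster points in the same direction as the second vector, so the function
# should return index 1 with a cosine similarity of (roughly) 1.0.
def _demo_argcenter():
    cluster = [np.array([1., 0.]), np.array([1., 1.]), np.array([0., 1.])]
    return find_argcenter(cluster)  # (1, ~1.0)
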
def agglomerate(samp, corpus):
if not os.path.isdir(CACHE_DIR):
os.mkdir(CACHE_DIR)
    # Convert tweets into bag-of-words vectors. Use toarray() so that
    # downstream code iterates over plain 1-D ndarray rows.
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(corpus).toarray()
# Cluster the vectors using agglomerative clustering over the cosine
# similarity space.
log(f'\tClustering {SAMPLE_SIZE} tweets...')
clustering = AgglomerativeClustering(
n_clusters=None,
        affinity='cosine',  # renamed to `metric` in scikit-learn >= 1.2
memory=str(CACHE_DIR),
compute_full_tree='auto',
linkage='complete',
distance_threshold=DISTANCE_THRESHOLD)
clustering.fit(vectors)
# Sort the clusters into lists.
log('\tFinding centers...')
    n_labels = len(set(clustering.labels_))
    cluster_points = [[] for _ in range(n_labels)]
    cluster_tweets = [[] for _ in range(n_labels)]
for idx, (point, label) in enumerate(zip(vectors, clustering.labels_)):
cluster_points[label].append(point)
cluster_tweets[label].append(samp[idx])
    # Sort the lists to find the largest clusters. Both sorts are stable and
    # use the same key, so points and tweets stay aligned by index.
cluster_points = sorted(cluster_points, key=lambda x: len(x), reverse=True)
cluster_tweets = sorted(cluster_tweets, key=lambda x: len(x), reverse=True)
# Find the representative tweets with the least distance to the center
# of each cluster.
reps = []
for idx in range(NUM_CLUSTERS):
try:
argcenter, confidence = find_argcenter(cluster_points[idx])
reps.append([len(cluster_tweets[idx]), confidence, cluster_tweets[idx][argcenter]])
        except IndexError:
            log(f'WARNING: There were only {n_labels} cluster(s)!')
            # Pretend there were NUM_CLUSTERS clusters by falling back to the
            # first tweet(s). This only happens when a sample contains very
            # few tweets, usually during testing.
            reps.append([1, 0, samp[idx]])
return reps
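
# Illustrative sketch (not part of the original module): shows how
# AgglomerativeClustering with n_clusters=None behaves. Points within the
# distance threshold merge; near-orthogonal bag-of-words vectors stay apart.
# The threshold 0.5 is a stand-in, not the project's DISTANCE_THRESHOLD, and
# `affinity` matches the scikit-learn version this module targets.
def _demo_threshold():
    toy = np.array([[1., 1., 0., 0.],
                    [1., 2., 0., 0.],
                    [0., 0., 1., 1.],
                    [0., 0., 2., 1.]])
    model = AgglomerativeClustering(
        n_clusters=None, affinity='cosine', linkage='complete',
        distance_threshold=0.5)
    return model.fit(toy).labels_  # two clusters, e.g. array([0, 0, 1, 1])
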
def find_cluster_reps(target, mock):
log(f'Clustering {target}...')
    # Correcting spelling helps normalize the text.
    speller = autocorrect.Speller(lang='en')
    # Open the source data file and use it as a corpus for clustering. The
    # corpus is filtered for stopwords and spelling, but the original,
    # unfiltered tweets are what get presented.
log('\tReading in data...')
samp = sample(target)
    corpus = [filter_text(row['text'], speller) for row in samp]
    # If we're mocking the data, simply return random tweets from the sample.
if mock:
subsamp = samp[np.random.choice(samp.shape[0], 3)]
reps = [[0, np.random.normal(0.75, 0.25), item] for item in subsamp]
# Mock cluster sizes so they appear reasonable.
reps[0][0] = np.random.randint(SAMPLE_SIZE / 5, SAMPLE_SIZE / 1.5)
reps[1][0] = np.random.randint(SAMPLE_SIZE / 10, SAMPLE_SIZE / 5)
reps[2][0] = np.random.randint(SAMPLE_SIZE / 10, SAMPLE_SIZE / 5)
reps = sorted(reps, key=lambda x: x[0], reverse=True)
# Otherwise, find reps "the hard way" using clustering.
else:
reps = agglomerate(samp, corpus)
log('...done.')
return reps

if __name__ == '__main__':
cluster_reps = find_cluster_reps('#VeteransDay', False)
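    # Illustrative only: dump the representatives for a quick manual check,
    # assuming each rep is the [cluster_size, confidence, tweet_row] triple
    # built in agglomerate() and tweet_row carries a 'text' field as above.
    for size, confidence, tweet_row in cluster_reps:
        print(size, round(confidence, 2), tweet_row['text'])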