-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_analyzer.py
55 lines (54 loc) · 2.11 KB
/
data_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans as kmeans
from warnings import catch_warnings,filterwarnings
from re import search
def findstem(arr):
result = []
for x in range(len(arr)):
subparts = arr[x].split(" ")
for i in range(len(subparts)):
for j in range(i+1, len(subparts)+1):
part = " ".join(subparts[i:j])
count = 0
for y in range(len(arr)):
if (part in arr[y]) and (x != y):
count = count+1
result.append((count, part))
maxval = 0
result_str = ""
for count, str_part in result:
if(count > maxval):
maxval = count
result_str = str_part
elif(count == maxval and len(str_part) > len(result_str)):
result_str = str_part
return result_str
def segregate(transaction_details, n):
convergence_warning = None
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(transaction_details)
word_counts = vector.toarray()
k_means = kmeans(n_clusters=n)
try:
with catch_warnings():
filterwarnings('error')
try:
k_means.fit(word_counts)
except Warning as w:
convergence_warning = str(w).split(".")[0]
cluster_ids = k_means.labels_
unique_clustor_ids = set(cluster_ids)
unique_clusters = {}
for clustor_id in unique_clustor_ids:
common_words = []
for j in range(len(cluster_ids)):
if(clustor_id == cluster_ids[j]):
common_words.append(transaction_details[j])
unique_clusters.update({clustor_id: findstem(list(set(common_words)))})
clusters = []
for clustor_id in cluster_ids:
clusters.append((clustor_id, unique_clusters[clustor_id]))
return (clusters,convergence_warning)
except Exception:
convergence_warning = "Enable to find "+str(n)+" clusters"
return ([],convergence_warning)