import sys
import re
import pickle

import classifier_helper, html_helper

#Python 2 idiom: reload(sys) re-exposes setdefaultencoding so it can be called
reload(sys)
sys.setdefaultencoding('utf-8')
#start class
class BaselineClassifier:
    """ Classifier using baseline method """
    #variables
    #start __init__
    def __init__(self, data, keyword, time):
        #Instantiate classifier helper
        self.helper = classifier_helper.ClassifierHelper('data/feature_list.txt')
        #Remove duplicates
        self.lenTweets = len(data)
        self.origTweets = self.getUniqData(data)
        self.tweets = self.getProcessedTweets(self.origTweets)
        self.results = {}
        self.neut_count = [0] * self.lenTweets
        self.pos_count = [0] * self.lenTweets
        self.neg_count = [0] * self.lenTweets
        self.time = time
        self.keyword = keyword
        self.html = html_helper.HTMLHelper()
    #end
    #start getUniqData
    def getUniqData(self, data):
        uniq_data = {}
        for i in data:
            d = data[i]
            u = []
            for element in d:
                if element not in u:
                    u.append(element)
            #end inner loop
            uniq_data[i] = u
        #end outer loop
        return uniq_data
    #end
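    # Note: getUniqData keeps the first occurrence of each tweet per index and
    # preserves order, e.g. {0: ['a', 'b', 'a']} becomes {0: ['a', 'b']}.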
    #start getProcessedTweets
    def getProcessedTweets(self, data):
        tweets = {}
        for i in data:
            d = data[i]
            tw = []
            for t in d:
                tw.append(self.helper.process_tweet(t))
            tweets[i] = tw
        #end loop
        return tweets
    #end
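    # Note: process_tweet comes from ClassifierHelper and is assumed here to
    # normalize raw tweet text (e.g. lowercasing, stripping URLs and handles)
    # before the keyword matching in classify.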
    #start classify
    def classify(self):
        #load positive keywords file (skip blank lines: an empty keyword
        #would match every tweet via the word-boundary regex)
        inpfile = open("data/pos_mod.txt", "r")
        positive_words = [line.strip() for line in inpfile if line.strip()]
        inpfile.close()
        #load negative keywords file
        inpfile = open("data/neg_mod.txt", "r")
        negative_words = [line.strip() for line in inpfile if line.strip()]
        inpfile.close()
        #start processing each tweet
        for i in self.tweets:
            tw = self.tweets[i]
            count = 0
            res = {}
            for t in tw:
                neg_words = [word for word in negative_words if self.string_found(word, t)]
                pos_words = [word for word in positive_words if self.string_found(word, t)]
                if len(pos_words) > len(neg_words):
                    label = 'positive'
                    self.pos_count[i] += 1
                elif len(pos_words) < len(neg_words):
                    label = 'negative'
                    self.neg_count[i] += 1
                else:
                    #tie-breaking: an equal, non-zero number of positive and
                    #negative matches is labeled positive; no matches at all
                    #is labeled neutral
                    if len(pos_words) > 0 and len(neg_words) > 0:
                        label = 'positive'
                        self.pos_count[i] += 1
                    else:
                        label = 'neutral'
                        self.neut_count[i] += 1
                result = {'text': t, 'tweet': self.origTweets[i][count], 'label': label}
                res[count] = result
                count += 1
            #end inner loop
            self.results[i] = res
        #end outer loop
        #cache results so they can be reloaded without reclassifying
        filename = 'data/results_lastweek.pickle'
        outfile = open(filename, 'wb')
        pickle.dump(self.results, outfile)
        outfile.close()
        '''
        inpfile = open('data/results_lastweek.pickle')
        self.results = pickle.load(inpfile)
        inpfile.close()
        '''
    #end
    #start substring whole word match
    def string_found(self, string1, string2):
        if re.search(r"\b" + re.escape(string1) + r"\b", string2):
            return True
        return False
    #end
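    # Note: the word-boundary match avoids substring false positives, e.g.
    # string_found('art', 'state of the art') is True, while
    # string_found('art', 'an artful dodger') is False.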
    #start writeOutput
    def writeOutput(self, filename, writeOption='w'):
        fp = open(filename, writeOption)
        for i in self.results:
            res = self.results[i]
            for j in res:
                item = res[j]
                text = item['text'].strip()
                label = item['label']
                writeStr = text + " | " + label + "\n"
                fp.write(writeStr)
            #end inner loop
        #end outer loop
        fp.close()
    #end writeOutput
    #start getHTML
    def getHTML(self):
        return self.html.getResultHTML(self.keyword, self.results, self.time, self.pos_count, \
                                       self.neg_count, self.neut_count, 'baseline')
    #end
#end class
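#start usage sketch
# A minimal usage sketch, not part of the original module. It assumes `data`
# is a dict mapping an integer index to a list of raw tweet strings (the
# shape getUniqData and classify iterate over), and that the data/ files
# used by ClassifierHelper and classify are present.
if __name__ == '__main__':
    sample_data = {0: ['I love this phone', 'I love this phone', 'worst battery ever']}
    bc = BaselineClassifier(sample_data, keyword='phone', time='last week')
    bc.classify()
    bc.writeOutput('data/baseline_output.txt')
    print bc.pos_count[0], bc.neg_count[0], bc.neut_count[0]
#end usage sketch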