# NaiveBayes.py
import sys
import getopt
import os
import math
import re
class NaiveBayes:
    """Naive Bayes text classifier that labels a list of words 'pos' or 'neg'.

    addExample accumulates per-class word frequencies; classify compares the
    add-one-smoothed log-probabilities of the two classes. Optional
    preprocessing (stop-word filtering, negation features) is controlled by
    the FILTER_STOP_WORDS and NEGATION_FEATURES flags.
    """

    class TrainSplit:
        """Represents a set of training/testing data. self.train is a list of Examples, as is self.test.
        """
        def __init__(self):
            self.train = []
            self.test = []

    class Example:
        """Represents a document with a label. klass is 'pos' or 'neg' by convention.
        words is a list of strings.
        """
        def __init__(self):
            self.klass = ''
            self.words = []

    # Tokens that end the scope of a negation in negationFeatures.
    _PUNCTUATION = (',', '.', '?', '!', ';')
    # "not" and "never" must be whole tokens; an unanchored "never" would also
    # fire on words that merely contain it (e.g. "whenever", "nevertheless").
    _NEGATION_RE = re.compile(r"^not$|^never$|[a-z]n't$")

    def __init__(self, stopListPath='../data/english.stop'):
        """NaiveBayes initialization

        * stopListPath: whitespace-separated stop-word file; it is read
          eagerly, so a missing file raises here.
        """
        self.FILTER_STOP_WORDS = False
        self.stopList = set(self.readFile(stopListPath))
        self.numFolds = 10
        self.posText = {}  # mega text for positive reviews, with frequency
        self.negText = {}  # mega text for negative reviews, with frequency
        self.text = {}  # mega text for all reviews
        self.numPosWords = 0.0  # total number of words in positive mega text
        self.numNegWords = 0.0  # total number of words in negative mega text
        self.numPosReviews = 0.0  # number of positive reviews
        self.numNegReviews = 0.0  # number of negative reviews
        self.NEGATION_FEATURES = True

    def classify(self, words):
        """
        'words' is a list of words to classify. Return 'pos' or 'neg' classification.

        * Ties go to 'neg'.
        * If only one class was seen in training, that class is returned
          (the other class has prior 0, whose log is undefined).
        * Raises ValueError if no example has been added yet.
        """
        numTotalReviews = self.numPosReviews + self.numNegReviews
        if numTotalReviews == 0:
            raise ValueError('classify() called before any training example was added')
        if self.numPosReviews == 0:
            return 'neg'
        if self.numNegReviews == 0:
            return 'pos'

        probPos = math.log(self.numPosReviews / float(numTotalReviews))  # prior of positive reviews
        probNeg = math.log(self.numNegReviews / float(numTotalReviews))  # prior of negative reviews

        # add-1 smoothing, plus one extra slot for unknown words; the
        # denominators do not depend on the word, so compute them once.
        vocabSize = len(self.text)
        posDenom = float(self.numPosWords + vocabSize + 1)
        negDenom = float(self.numNegWords + vocabSize + 1)
        for word in words:
            probPos += math.log((self.posText.get(word, 0) + 1) / posDenom)
            probNeg += math.log((self.negText.get(word, 0) + 1) / negDenom)

        if probPos > probNeg:
            return 'pos'
        return 'neg'

    def addExample(self, klass, words):
        """
        * Train the model on an example document with label klass ('pos' or 'neg') and
        * words, a list of strings. Any klass other than 'pos' is counted as negative.
        * Updates the frequency tables and counters stored on the instance.
        * Returns nothing
        """
        isPositive = (klass == 'pos')
        classCounts = self.posText if isPositive else self.negText

        # Regular (multinomial) Naive Bayes counts every occurrence; iterate
        # over set(words) here instead for the Boolean Naive Bayes variant.
        numWords = 0
        for word in words:
            classCounts[word] = classCounts.get(word, 0) + 1
            self.text[word] = self.text.get(word, 0) + 1
            numWords += 1

        if isPositive:
            self.numPosReviews += 1
            self.numPosWords += numWords
        else:
            self.numNegReviews += 1
            self.numNegWords += numWords

    def filterStopWords(self, words):
        """
        * Filters stop words found in self.stopList.
        * Returns a new list; 'words' is not modified.
        """
        return [word for word in words if word not in self.stopList]

    def negationFeatures(self, words):
        """
        * Detect negation words (not, n't and never) and add NOT_ to each word
          until the next punctuation token (one of , . ? ! ;).
        * The negation word itself is only prefixed if it falls inside an
          already-open negation scope.
        * Returns a new list; 'words' is not modified.
        """
        negation = False
        neg_words = []
        for word in words:
            if word in self._PUNCTUATION:
                # Punctuation closes the scope and is never prefixed.
                negation = False
                neg_words.append(word)
                continue
            # Match on the raw token, before any NOT_ prefix is attached.
            opensScope = self._NEGATION_RE.search(word) is not None
            if negation:
                neg_words.append("NOT_" + word)
            else:
                neg_words.append(word)
            if opensScope:
                negation = True
        return neg_words

    def readFile(self, fileName):
        """
        * Reads fileName and returns its contents as a list of
        * whitespace-separated tokens (see segmentWords).
        """
        # 'with' guarantees the handle is closed even if reading fails.
        with open(fileName) as f:
            return self.segmentWords(f.read())

    def segmentWords(self, s):
        """
        * Splits lines on whitespace for file reading
        """
        return s.split()

    def _loadExamples(self, dataDir, klass):
        """Reads every file under dataDir/klass; returns a list of (fileName, Example) pairs."""
        loaded = []
        for fileName in os.listdir('%s/%s/' % (dataDir, klass)):
            example = self.Example()
            example.words = self.readFile('%s/%s/%s' % (dataDir, klass, fileName))
            example.klass = klass
            loaded.append((fileName, example))
        return loaded

    def _preprocess(self, words):
        """Applies the enabled preprocessing steps (stop-word filter, then negation features)."""
        if self.FILTER_STOP_WORDS:
            words = self.filterStopWords(words)
        if self.NEGATION_FEATURES:
            words = self.negationFeatures(words)
        return words

    def trainSplit(self, trainDir):
        """Takes in a trainDir, returns one TrainSplit with train set.

        Examples from trainDir/pos come first, then those from trainDir/neg.
        """
        split = self.TrainSplit()
        for klass in ('pos', 'neg'):
            for _, example in self._loadExamples(trainDir, klass):
                split.train.append(example)
        return split

    def train(self, split):
        """Adds every example in split.train to the model after preprocessing."""
        for example in split.train:
            self.addExample(example.klass, self._preprocess(example.words))

    def crossValidationSplits(self, trainDir):
        """Returns a list of TrainSplits corresponding to the cross validation splits.

        A file belongs to the test set of fold k when the third character of
        its name equals str(k); in every other fold it is a training example.
        Each file is read once and its Example is shared between the splits.
        """
        labelled = self._loadExamples(trainDir, 'pos') + self._loadExamples(trainDir, 'neg')
        splits = []
        for fold in range(0, self.numFolds):
            foldTag = str(fold)
            split = self.TrainSplit()
            for fileName, example in labelled:
                # Slicing (not indexing) so names shorter than three
                # characters simply never match instead of raising.
                if fileName[2:3] == foldTag:
                    split.test.append(example)
                else:
                    split.train.append(example)
            splits.append(split)
        return splits

    def test(self, split):
        """Returns a list of labels for split.test.

        Test documents get the same preprocessing as training documents, so
        the features seen by classify match the ones counted in train.
        """
        return [self.classify(self._preprocess(example.words)) for example in split.test]

    def buildSplits(self, args):
        """Builds the splits for training/testing

        * args == [trainDir]: cross-validation splits over trainDir.
        * args == [trainDir, testDir]: a single split, trained on trainDir
          and tested on testDir.
        * More than two entries yields an empty list; an empty args raises
          IndexError.
        """
        splits = []
        trainDir = args[0]
        if len(args) == 1:
            print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (self.numFolds, trainDir))
            splits = self.crossValidationSplits(trainDir)
        elif len(args) == 2:
            testDir = args[1]
            print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (trainDir, testDir))
            split = self.trainSplit(trainDir)
            for klass in ('pos', 'neg'):
                for _, example in self._loadExamples(testDir, klass):
                    split.test.append(example)
            splits.append(split)
        return splits
def main():
    """Command-line entry point: train and evaluate NaiveBayes, printing accuracies.

    Usage: NaiveBayes.py [-f] [trainDir [testDir]]

    * -f turns on stop-word filtering.
    * With no directory the data set ../data/imdb1/ is used.
    * The splits come from NaiveBayes.buildSplits; a fresh classifier is
      trained and scored for each split, and the mean accuracy over the
      evaluated splits is printed last.
    """
    nb = NaiveBayes()

    # default parameters: no stop word filtering, and
    # training/testing on ../data/imdb1
    (options, args) = getopt.getopt(sys.argv[1:], 'f')
    if not args:
        # Also covers "-f" given without a directory, which would otherwise
        # reach buildSplits with an empty argument list.
        args = ['../data/imdb1/']
    if ('-f', '') in options:
        nb.FILTER_STOP_WORDS = True

    splits = nb.buildSplits(args)
    if not splits:
        # buildSplits returns nothing when given more than two directories.
        sys.exit('usage: NaiveBayes.py [-f] [trainDir [testDir]]')

    def preprocess(classifier, words):
        # The flags live on nb; the filtering helpers are called on the
        # per-split classifier.
        if nb.FILTER_STOP_WORDS:
            words = classifier.filterStopWords(words)
        if nb.NEGATION_FEATURES:
            words = classifier.negationFeatures(words)
        return words

    avgAccuracy = 0.0
    evaluated = 0
    for fold, split in enumerate(splits):
        classifier = NaiveBayes()
        for example in split.train:
            classifier.addExample(example.klass, preprocess(classifier, example.words))

        if not split.test:
            # Nothing to score; skip rather than divide by zero.
            print('[WARN]\tFold %d has no test examples, skipped' % fold)
            continue

        correct = 0.0
        for example in split.test:
            guess = classifier.classify(preprocess(classifier, example.words))
            if example.klass == guess:
                correct += 1.0

        accuracy = correct / len(split.test)
        avgAccuracy += accuracy
        evaluated += 1
        print('[INFO]\tFold %d Accuracy: %f' % (fold, accuracy))

    if evaluated:
        avgAccuracy = avgAccuracy / evaluated
    print('[INFO]\tAccuracy: %f' % avgAccuracy)


if __name__ == "__main__":
    main()