NB_in_memory.py
# In-memory multinomial Naive Bayes text classifier for the DBPedia dataset
# (one document per line: a comma-separated label field followed by tokens).
import re
import time
from collections import Counter

import numpy as np

train_path = './DBPedia.full/full_train.txt'
test_path = './DBPedia.full/full_test.txt'
devel_path = './DBPedia.full/full_devel.txt'
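
# The classifier scores a document d against class y as
#   score(y, d) = log P(y) + sum_w count(w, d) * log P(w | y)
# with Lidstone-smoothed likelihoods
#   P(w | y) = (count(w, y) + smooth) / (total_words(y) + smooth * |V|).
# Counts are stored flat in a single dict keyed "word^label".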
def _train(file_path):
    """Accumulate per-class word counts and class frequencies from the training file."""
    with open(file_path, 'r') as f:
        lines = f.readlines()
    data = {}     # "word^label" -> count of word across documents with that label
    y_data = {}   # label -> (total words counted for that label, times the label occurs)
    vocab = []
    total_class = 0
    for doc in lines:
        doc = doc.lower().strip().split()
        labels = doc[0].split(",")
        # Strip punctuation, keeping alphabetic tokens whose raw form is longer than 3 chars.
        doc = [re.sub(r'[^\w\s]', '', word) for word in doc[3:]
               if re.sub(r'[^\w\s]', '', word).isalpha() and len(word) > 3]
        doc = Counter(doc)
        vocab += list(doc.keys())
        for label in labels:
            total_class += 1
            if label in y_data:
                total_words, y_freq = y_data[label]
                y_data[label] = (total_words + sum(doc.values()), y_freq + 1)
            else:
                y_data[label] = (sum(doc.values()), 1)
            for word in doc:
                key = word + "^" + label
                data[key] = data.get(key, 0) + doc[word]
    return data, y_data, set(vocab), total_class
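
# Note: each input line is assumed to start with a comma-separated label field
# (token 0) followed by two metadata tokens that _train and read_test_data skip
# via doc[3:]; the exact meaning of the skipped tokens depends on how the
# DBPedia dump was preprocessed.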
def read_test_data(file_path, vocab):
    """Read test documents, keeping only words seen in the training vocabulary."""
    with open(file_path, 'r') as f:
        lines = f.readlines()
    data = []
    labels = []
    test_vocab = []
    for doc in lines:
        doc = doc.lower().strip().split()
        labels.append(doc[0].split(","))
        doc = [re.sub(r'[^\w\s]', '', word) for word in doc[3:]
               if re.sub(r'[^\w\s]', '', word) in vocab]
        doc = Counter(doc)
        test_vocab += list(doc.keys())
        data.append(doc)
    return data, labels, set(test_vocab)
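
# Words never seen in training are dropped from the test documents above, so
# only in-vocabulary words contribute to the scores below; smoothing handles
# words that are in the vocabulary but unseen for a particular class.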
def get_prob(train_data, y_data, test_vocab, vocab_size, smooth):
    """Replace raw counts with smoothed log-likelihoods log P(word|class),
    restricted to words that actually occur in the test set."""
    for word in test_vocab:
        for y in y_data:
            key = word + "^" + y
            if key in train_data:
                train_data[key] = np.log((train_data[key] + smooth) /
                                         (y_data[y][0] + vocab_size * smooth))
    return train_data
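
# Design note: get_prob rewrites train_data in place and only touches entries
# for words that occur in the test set; the remaining entries keep their raw
# counts, but they are never queried during scoring, so this saves a pass
# over the full vocabulary.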
def get_test_score(doc, y, train_data, y_data, vocab_size, smooth, total_class):
    """Log-posterior score of class y for one document: smoothed word
    log-likelihoods (a zero count falls back to the smoothing floor)
    plus the log class prior."""
    score = 0
    for word in doc:
        key = word + "^" + y
        if key in train_data:
            score += doc[word] * train_data[key]
        else:
            score += doc[word] * np.log(smooth / (y_data[y][0] + vocab_size * smooth))
    score += np.log(y_data[y][1] / total_class)
    return score
def get_accuracy(test_data, test_labels, train_data, y_data, vocab_size, smooth, total_class):
    """Score every class per document and take the argmax; a prediction counts
    as correct if it matches any of the document's gold labels."""
    count = 0
    y_list = list(y_data.keys())
    for i, doc in enumerate(test_data):
        score = [get_test_score(doc, y, train_data, y_data, vocab_size, smooth, total_class)
                 for y in y_list]
        label = y_list[np.argmax(score)]
        if label in test_labels[i]:
            count += 1
    return count / len(test_data)
### Training Phase
start = time.time()
train_data, y_data, vocab, total_class = _train(train_path)
print("train_time: %s seconds" % (time.time() - start))

### Testing Phase
start = time.time()
test_data, test_labels, test_vocab = read_test_data(test_path, vocab)
smooth = 0.015  # Lidstone smoothing constant
train_data_proc = get_prob(train_data, y_data, test_vocab, len(vocab), smooth)
acc = get_accuracy(test_data, test_labels, train_data_proc, y_data, len(vocab), smooth, total_class)
print("test_time: %s seconds" % (time.time() - start))
print("test_accuracy: ", acc)