from sklearn.model_selection import KFold
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from preprocess import prepare_data
from utils import build_word2vec, essay_to_sentences

#####
# Hyperparameters for word2vec
num_features = 300      # dimensionality of the word vectors
min_word_count = 600    # ignore words that occur fewer times than this
num_workers = 4         # worker threads for training
context = 10            # context window size
downsampling = 1e-3     # downsampling threshold for frequent words
epochs = 30             # training epochs
#####
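
# For reference: a minimal sketch (an assumption, not the project's actual helper)
# of how utils.build_word2vec presumably maps the hyperparameters above onto
# gensim's pre-4.0 Word2Vec API (matching the model.wv.vocab usage below).
# It is illustrative only and is not called anywhere in this module.
def _word2vec_reference_sketch(train_sentences):
    from gensim.models import Word2Vec
    return Word2Vec(train_sentences, workers=num_workers, size=num_features,
                    min_count=min_word_count, window=context, sample=downsampling)
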
dataset_path = './data/training_set_rel3.tsv'  # ASAP essay-scoring training data
def plot_qwk_scores_all_sets(sets):
    """Plot the per-fold QWK score of each of the eight essay sets."""
    fig = plt.figure()
    ax = plt.subplot(111)
    x = [1, 2, 3, 4, 5]  # fold indices
    for i, qwk_scores in enumerate(sets, start=1):
        ax.plot(x, qwk_scores, label='set{}'.format(i))
    plt.title('Set-wise QWK using BERT for individual sets')
    plt.xlabel('Fold')
    plt.ylabel('QWK')
    ax.legend()
    plt.show()
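
# Example usage (scores below are made-up placeholders; one 5-fold QWK list per set):
#   qwk_per_set = [[0.70, 0.72, 0.68, 0.71, 0.73] for _ in range(8)]
#   plot_qwk_scores_all_sets(qwk_per_set)
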
def tsne_plot(model):
    """Create a 2-D t-SNE projection of the word2vec vocabulary and plot it."""
    labels = []
    tokens = []
    for word in model.wv.vocab:
        tokens.append(model.wv[word])
        labels.append(word)
    # Reduce the word vectors to two dimensions with t-SNE.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]
    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig('Graph.png')
    plt.show()
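
# Example usage (assumes a trained gensim word2vec model, e.g. from build_word2vec):
#   model, _ = build_word2vec(train_sentences, num_workers, num_features,
#                             min_word_count, context, downsampling)
#   tsne_plot(model)  # also writes the scatter plot to Graph.png
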
def plot_accuracy_curve(history):
    """Plot the training loss and MAE recorded in a Keras History object."""
    plt.plot(history.history['loss'])
    plt.plot(history.history['mae'])
    plt.title('Model loss and MAE')
    plt.ylabel('Loss / MAE')
    plt.xlabel('Epoch')
    plt.legend(['Loss', 'MAE'], loc='upper left')
    plt.show()
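
# Example usage (assumes a Keras model compiled with metrics=['mae']):
#   history = model.fit(X_train, y_train, epochs=epochs)
#   plot_accuracy_curve(history)
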
def plot_architecture(filename, model):
    """Save a diagram of the model architecture to <filename>.png."""
    from keras.utils import plot_model
    plot_model(model, to_file=str(filename) + '.png')
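
# Example usage: plot_architecture('lstm_model', model) writes lstm_model.png.
# Note that keras' plot_model requires the pydot and graphviz packages.
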
def build_visualization():
    """Train word2vec per cross-validation fold and visualize the embeddings."""
    cv = KFold(n_splits=2, shuffle=True)
    X, y = prepare_data(dataset_path=dataset_path)
    cv_data = cv.split(X)
    fold_count = 1
    for traincv, testcv in cv_data:
        print("\n--------Fold {}--------\n".format(fold_count))
        # Split the dataset into train and test folds.
        X_train, X_test, y_train, y_test = X.iloc[traincv], X.iloc[testcv], y.iloc[traincv], y.iloc[testcv]
        train_essays = X_train['essay']
        test_essays = X_test['essay']
        # Collect every sentence from the training essays for word2vec.
        train_sentences = []
        for essay in train_essays:
            train_sentences += essay_to_sentences(essay, remove_stopwords=True)
        # Train the word2vec embedding on this fold's sentences.
        print("Converting sentences to word2vec model")
        model, sorted_dic = build_word2vec(train_sentences, num_workers, num_features,
                                           min_word_count, context, downsampling)
        # Print the nearest neighbours of the ten most frequent words.
        for k, v in sorted_dic[:10]:
            print("----------most_similar_word_for:" + str(k) + "--------------")
            print(model.wv.most_similar(k))
        # Project and plot this fold's embedding space.
        tsne_plot(model)
        fold_count += 1
if __name__ == '__main__':
build_visualization()