# model_analysis.py

import math

from scipy.spatial import distance

COSINE = "cosine"
EUCLIDEAN = "euclidean"

def similarity(v1, v2, metric=COSINE):
    """Return a similarity score under the given metric; higher means more similar."""
    if metric == COSINE:
        return cosine_similarity(v1, v2)
    elif metric == EUCLIDEAN:
        # Negate the distance so that larger values always mean "more similar".
        return -euclidean_distance(v1, v2)
    else:
        raise ValueError("Unsupported metric %r" % metric)

def cosine_similarity(v1, v2):
    """Cosine similarity; note that scipy's distance.cosine returns 1 - similarity."""
    d = distance.cosine(v1, v2)
    # distance.cosine yields NaN when either vector is all zeros (e.g. a lexeme
    # without a vector); treat such pairs as having zero similarity.
    if math.isnan(d):
        return 0
    return 1 - d


def euclidean_distance(v1, v2):
    return distance.euclidean(v1, v2)

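# A quick illustration of the conventions above (a sketch, not part of the
# original module): identical vectors score 1.0 under cosine, orthogonal
# vectors 0.0, and Euclidean scores are negated distances.
#
#   >>> similarity([1.0, 0.0], [1.0, 0.0])
#   1.0
#   >>> similarity([1.0, 0.0], [0.0, 1.0])
#   0.0
#   >>> similarity([3.0, 4.0], [0.0, 0.0], metric=EUCLIDEAN)
#   -5.0
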
class SpacyModelAnalyzer:
    """Exploratory utilities over the word vectors of a loaded spaCy model."""

    def __init__(self, nlp):
        self._nlp = nlp

    @property
    def nlp(self):
        return self._nlp

    def find_by_analogy(self, token_string_a, token_string_b, token_string_c, num_results=10):
        """Solve "a is to b as c is to ?" by vector arithmetic: d = b - a + c."""
        d_vector = (self.word_vector(token_string_b) - self.word_vector(token_string_a)
                    + self.word_vector(token_string_c))
        c_orth = self.token(token_string_c).orth
        # Restrict the search to lower-case vocabulary entries that actually
        # carry a vector, and exclude the query word c itself.
        words_to_consider = [
            word for word in self.nlp.vocab
            if word.has_vector and word.orth != c_orth and word.orth_.islower()]
        by_similarity = sorted(
            words_to_consider, key=lambda word: similarity(word.vector, d_vector), reverse=True)
        return [(word.orth_, similarity(word.vector, d_vector), word.prob)
                for word in by_similarity[:num_results]]

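    # Hypothetical usage (assumes a model with word vectors, e.g. en_core_web_md):
    #
    #   analyzer.find_by_analogy("man", "king", "woman")
    #
    # solves "man is to king as woman is to ?" and, with typical English
    # vectors, should rank "queen" near the top of the results.
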
    def word_vector(self, string):
        """Return the vector of the first token of `string`."""
        return self.token(string).vector

    def token(self, string):
        """Return the first token of the parsed string."""
        return self.nlp(string)[0]

    def find_by_similarity(self, token_string, metric=COSINE, num_results=10):
        """Return the `num_results` vocabulary words most similar to `token_string`."""
        vector = self.word_vector(token_string)
        words_to_consider = [
            word for word in self.nlp.vocab
            if word.has_vector and word.orth_.islower()]
        by_similarity = sorted(
            words_to_consider,
            key=lambda word: similarity(word.vector, vector, metric),
            reverse=True)
        return [(word.orth_, similarity(word.vector, vector, metric), word.prob)
                for word in by_similarity[:num_results]]

    def interpolate_linear(self, token_string_a, token_string_b, num_steps=3):
        """Walk the straight line between the vectors of a and b, returning the
        nearest word at each of `num_steps` evenly spaced interior points."""
        vector_a = self.word_vector(token_string_a)
        vector_b = self.word_vector(token_string_b)
        a_orth = self.token(token_string_a).orth
        b_orth = self.token(token_string_b).orth
        words_to_consider = [
            word for word in self.nlp.vocab
            if word.has_vector and word.orth_.islower()
            and word.orth not in (a_orth, b_orth)]
        results = []
        for n in range(num_steps):
            # Interpolation parameter in (0, 1), excluding the endpoints.
            r = float(n + 1) / (num_steps + 1)
            vector = vector_a + r * (vector_b - vector_a)
            by_similarity = sorted(
                words_to_consider,
                key=lambda word: similarity(word.vector, vector), reverse=True)
            if by_similarity:
                word = by_similarity[0]
                results.append((word.orth_, similarity(word.vector, vector), word.prob))
        return results

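    # Hypothetical usage: with typical English vectors, interpolating between
    # two related words surfaces the word nearest each intermediate point, e.g.
    #
    #   analyzer.interpolate_linear("cold", "hot", num_steps=3)
    #
    # might pass through temperature-related words such as "warm".
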
    def interpolate_nearest_neighbour(self, token_string_a, token_string_b, num_neighbours=10):
        """Walk from a towards b: at each step, move to whichever of the current
        word's `num_neighbours` nearest neighbours is most similar to b.

        Note: the walk only terminates once b itself is chosen, so it can run
        for a long time if b never appears among the nearest neighbours.
        """
        vector_a = self.word_vector(token_string_a)
        vector_b = self.word_vector(token_string_b)
        a_orth = self.token(token_string_a).orth
        b_orth = self.token(token_string_b).orth
        results = []
        current_orth = a_orth
        current_vector = vector_a
        previous_orths = set()
        while current_orth != b_orth:
            # Never revisit a word, otherwise the walk could cycle forever.
            previous_orths.add(current_orth)
            words_to_consider = [
                word for word in self.nlp.vocab
                if word.has_vector and word.orth not in previous_orths
                and word.orth_.islower()]
            by_similarity_to_current = sorted(
                words_to_consider,
                key=lambda word: similarity(word.vector, current_vector), reverse=True)
            nearest_neighbours = by_similarity_to_current[:num_neighbours]
            print([w.orth_ for w in nearest_neighbours])
            # Of the current word's neighbourhood, step to the word closest to b.
            word = max(
                nearest_neighbours,
                key=lambda word: similarity(word.vector, vector_b))
            print(word.orth_)  # TEMP
            print("similarity to b:", similarity(word.vector, vector_b))  # TEMP
            results.append((word.orth_, word.prob))
            current_orth = word.orth
            current_vector = word.vector
        return results

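    # Hypothetical usage (the prints above trace each step of the walk):
    #
    #   analyzer.interpolate_nearest_neighbour("cat", "dog", num_neighbours=10)
    #
    # Each step is drawn from the current word's neighbourhood, so the path
    # tends to move through semantically related words rather than jumping
    # straight to the target.
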
    def compare_distance_metrics(self, w1, rank_threshold=100):
        """Yield lexemes whose nearest-neighbour rank to `w1` disagrees sharply
        between cosine and Euclidean distance: inside the top `rank_threshold`
        by one metric but outside it by the other."""
        token1 = self.token(w1)
        v1 = token1.vector
        orth1 = token1.orth
        lexemes_to_consider = [
            lexeme for lexeme in self.nlp.vocab
            if lexeme.has_vector and lexeme.orth != orth1]
        # Rank every lexeme by cosine distance to w1.
        by_cosine = sorted(
            lexemes_to_consider, key=lambda lexeme: distance.cosine(v1, lexeme.vector))
        cosine_ranks = {}
        for index, lexeme in enumerate(by_cosine):
            cosine_ranks[lexeme.orth] = index
        # And again by Euclidean distance.
        by_euclidean = sorted(
            lexemes_to_consider, key=lambda lexeme: distance.euclidean(v1, lexeme.vector))
        euclidean_ranks = {}
        for index, lexeme in enumerate(by_euclidean):
            euclidean_ranks[lexeme.orth] = index

        def rank_difference(lexeme):
            return abs(cosine_ranks[lexeme.orth] - euclidean_ranks[lexeme.orth])

        by_disagreement = sorted(lexemes_to_consider, key=rank_difference, reverse=True)
        for lexeme in by_disagreement:
            cosine_rank = cosine_ranks[lexeme.orth]
            euclidean_rank = euclidean_ranks[lexeme.orth]
            # Report only lexemes that one metric puts inside the top
            # `rank_threshold` while the other does not.
            if (cosine_rank < rank_threshold) != (euclidean_rank < rank_threshold):
                yield (lexeme, cosine_rank, distance.cosine(v1, lexeme.vector),
                       euclidean_rank, distance.euclidean(v1, lexeme.vector))
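

# A minimal usage sketch (an assumption, not part of the original module): it
# requires spaCy plus a model that ships word vectors; "en_core_web_md" below
# is one such model, chosen purely for illustration.
if __name__ == "__main__":
    import spacy

    nlp = spacy.load("en_core_web_md")
    analyzer = SpacyModelAnalyzer(nlp)

    # Ten words most similar to "dog", as (word, score, log-probability) tuples.
    for word, score, prob in analyzer.find_by_similarity("dog"):
        print(word, score, prob)

    # First few cosine/Euclidean rank disagreements for "dog".
    for i, row in enumerate(analyzer.compare_distance_metrics("dog")):
        print(row)
        if i >= 4:
            break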