#!/usr/bin/env python
# get document vectors & plot
#
# variety of schemes to construct document vectors: can use counts or tfidf,
# and can use word vectors or one-hot vectors
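#
# this version implements the tfidf x word-vector scheme; see term_freq()
# below for the commented-out count-based alternatives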

import argparse
import sys
from collections import Counter, defaultdict, OrderedDict
from math import log, acos

import numpy as np
import numpy.linalg as npla
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import fasttext

import vms_tokenize
from section_labels import *
from lang_labels import *

def term_freq(term, page):
    # raw count:
    #   return term_counts[page][term]
    # normalize by page length:
    #   return float(term_counts[page][term]) / sum(term_counts[page].values())
    # normalize by the count of the most common word on the page:
    return float(term_counts[page][term]) / max(term_counts[page].values())

def inv_doc_freq(term):
    return log(float(len(pages)) / len(doc_counts[term]))

def tfidf(term, page):
    return term_freq(term, page) * inv_doc_freq(term)
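
# i.e. tfidf(t, p) = (count(t, p) / max word count on p) * log(N / df(t)),
# where N is the number of pages and df(t) is the number of pages containing t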

def annotate(image, words, n=float("inf")):
    # label up to n points of the 2-D embedding with their page names
    for i, (label, x, y) in enumerate(zip(words, image[:, 0], image[:, 1])):
        if i == n:
            break
        plt.annotate(
            label,
            xy=(x, y),  # xytext=(-20, 20),
            alpha=0.4,
            # textcoords='offset points', ha='right', va='bottom',
            # bbox=dict(boxstyle='round,pad=0.5', fc='black', alpha=0.5),
            # arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0')
        )

path = "models/voynich.bin"
model = fasttext.load_model(path)

term_counts = defaultdict(Counter)  # page -> Counter of word counts
doc_counts = defaultdict(set)       # word -> set of pages containing it
# really this should be a set, but we want ordering
pages = OrderedDict()

# load words
for line in vms_tokenize.get_words("text16e6.evt", page_numbers=True):
    pg = line.pop(0)  # first token of each line is the page identifier
    pages[pg] = True
    term_counts[pg].update(line)
    for word in line:
        doc_counts[word].add(pg)

# build one vector per page: a tf-idf-weighted average of the fasttext
# vectors of the words on that page
doc_vectors = []
for p in pages:
    v = np.zeros(100)  # 100 = dimension of the fasttext embeddings
    total_tfidf = 0
    for w in set(term_counts[p]):
        if w in model:
            ti = tfidf(w, p)
            v = np.add(v, np.multiply(ti, model[w]))
            total_tfidf += ti
    # normalize; guard against pages with no in-vocabulary words
    doc_vectors.append(np.divide(v, total_tfidf) if total_tfidf > 0 else v)
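
# the header comment mentions counts / one-hot vectors as an alternative
# scheme; a minimal sketch of that variant (a hypothetical helper, defined
# here for illustration but not used below; vocabulary order is arbitrary):
vocab = sorted(doc_counts)
word_index = {w: i for i, w in enumerate(vocab)}

def onehot_doc_vector(p):
    # |vocab|-dimensional vector with one tf-idf-weighted slot per word
    v = np.zeros(len(vocab))
    for w in term_counts[p]:
        v[word_index[w]] = tfidf(w, p)
    return v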

pagelist = list(pages)
vectors = np.array(doc_vectors)

# pairwise cosine similarity: dot products over the outer product of norms
mag = npla.norm(vectors, axis=1)[:, None] * npla.norm(vectors, axis=1)
sims = np.divide(np.dot(vectors, vectors.T), mag)
sims = np.tril(sims, -1)  # don't align to self and remove dups
# indices of the 5 most similar pages for each page
indices = np.flip(np.argsort(sims, axis=1), axis=1)[:, :5]
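# note: after tril(), row i is nonzero only for j < i, so each pair is ranked
# once; early rows may pad their top 5 with zero similarities (distance pi/2)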

doc_dist = []
for i, p in enumerate(pagelist):
    # angular distance; clamp in case rounding pushes a similarity above 1.0
    L = [(p, pagelist[j], acos(min(sims[i, j], 1.0))) for j in indices[i, :]]
    doc_dist.extend(L)
    # print(p)
    # for _, match, dist in L:
    #     print("\t", match, "\t", dist)
doc_dist.sort(key=lambda x: x[2])
# for p in doc_dist:
#     print(p[0], "\t", p[1], "\t", p[2])

# project the page vectors to 2-D for plotting
tsne = TSNE(n_components=2, metric="cosine", random_state=2)
image = tsne.fit_transform(vectors)
# annotate(image, pages)

# color pages by manuscript section
color = {
    'astro': 'red',
    'herbal': 'green',
    'multiherbal': 'lime',
    'bath': 'cyan',
    'text': 'grey',
    'X': 'grey'  # fallback for pages without a section label
}
# alternative: color by language label instead
# color = {
#     'A': 'red',
#     'B': 'blue',
#     'X': 'grey'
# }
plt.scatter(*zip(*image), c=[color[section_labels.get(i, 'X')] for i in pages])
plt.show()