-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd.go
142 lines (100 loc) · 2.48 KB
/
add.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package gokapi
import (
"regexp"
"strings"
"sync"
)
// docMeta is the per-document result produced by process: the document's
// term-frequency table together with its length.
type docMeta struct {
	// tf is indexed as tf[key][token][id]. process fills it with
	// key = firstN(token, 3), id = the document id, and the count it computed
	// for that token as the value.
	tf map[string]map[string]map[string]float32
	// len is the number of tokens process found in the document, kept as a
	// float32 so Add can use it directly in its mean-length arithmetic.
	len float32
}
// nonWordChars matches any single character outside the regexp class \w,
// which in Go's RE2 syntax is ASCII-only: [0-9A-Za-z_]. It is compiled once
// at package initialization rather than on every stripRegex call; the
// pattern is a constant, so MustCompile cannot panic at run time.
var nonWordChars = regexp.MustCompile(`[^\w]`)

// stripRegex returns in with every non-word character replaced by a single
// space. Replacement is one-for-one, so a run of k non-word characters
// becomes k spaces. Characters outside ASCII letters, digits and underscore
// (including accented letters and all whitespace) count as non-word.
func stripRegex(in string) string {
	return nonWordChars.ReplaceAllString(in, " ")
}
// clean normalizes raw text for tokenization: the input is lower-cased first
// and the result is then passed through stripRegex.
func clean(input string) string {
	lowered := strings.ToLower(input)
	return stripRegex(lowered)
}
// firstN returns the prefix of s made of its first n runes. When s holds n
// runes or fewer, or when n is negative, s is returned unchanged.
func firstN(s string, n int) string {
	remaining := n
	// Ranging over a string yields the byte offset at which each rune starts,
	// so the offset reached once the budget hits zero is the cut point.
	for offset := range s {
		if remaining == 0 {
			return s[:offset]
		}
		remaining--
	}
	return s
}
// process tokenizes one document and builds its term-frequency table.
//
// The document is normalized with clean and split into whitespace-separated
// tokens. Each distinct token is stored at tf[key][token][id], where key is
// firstN(token, 3) and the value is the number of times that exact token
// occurs in the document. The returned len is the total number of tokens.
// A document with no tokens yields an empty table and a length of 0.
func process(id string, document string) docMeta {
	// strings.Fields drops the empty strings that splitting on a single space
	// produces for the runs of spaces clean leaves where punctuation was;
	// those empty "tokens" would otherwise be indexed and inflate the length.
	tokens := strings.Fields(clean(document))

	// Count whole-token occurrences in a single pass. A substring count over
	// the document would credit "a" for every "a" inside "banana" and costs a
	// full scan of the document per token.
	counts := make(map[string]float32, len(tokens))
	for _, token := range tokens {
		counts[token]++
	}

	content := make(map[string]map[string]map[string]float32)
	for token, count := range counts {
		key := firstN(token, 3)
		if _, exist := content[key]; !exist {
			content[key] = make(map[string]map[string]float32)
		}
		content[key][token] = map[string]float32{id: count}
	}
	return docMeta{content, float32(len(tokens))}
}
// Add indexes a batch of documents given as a map from document id to text.
//
// Every document is run through process in its own goroutine. Once all of
// them have finished, the results are folded sequentially into batch-wide
// term-frequency and idf tables while the document count and the running
// mean document length (seeded from retriever.Size and retriever.Mean) are
// updated. The new count and mean are then passed to writeMeta, and each
// prefix bucket is handed to writeTF / writeIDF in its own goroutine. Add
// returns only after all of those writes have completed.
func (retriever Retriever) Add(documents map[string]string) {
	// One buffer slot per document: no producer ever blocks on send, so it is
	// safe to wait for all of them before the channel is drained.
	results := make(chan docMeta, len(documents))

	var producers sync.WaitGroup
	producers.Add(len(documents))
	for id, document := range documents {
		go func(id string, document string) {
			defer producers.Done()
			results <- process(id, document)
		}(id, document)
	}
	producers.Wait()
	close(results)

	mean := retriever.Mean()
	n := retriever.Size()

	tf := make(map[string]map[string]map[string]float32)
	idf := make(map[string]map[string]float32)

	for sample := range results {
		// Incremental mean: fold this document's length into the average.
		n++
		mean += (sample.len - mean) / n

		for key, tokenIDTF := range sample.tf {
			// The tf and idf buckets for a prefix key are always created together.
			bucketTF, known := tf[key]
			if !known {
				bucketTF = make(map[string]map[string]float32)
				tf[key] = bucketTF
				idf[key] = make(map[string]float32)
			}
			bucketIDF := idf[key]

			for token, idtf := range tokenIDTF {
				postings, seen := bucketTF[token]
				if seen {
					// Token already met in this batch: one more document has it.
					bucketIDF[token]++
				} else {
					// First sighting in this batch: start from the value the
					// retriever reports for the token and add this document.
					// NOTE(review): presumably IDF returns the stored document
					// count for the token — confirm against its definition.
					postings = make(map[string]float32)
					bucketTF[token] = postings
					bucketIDF[token] = retriever.IDF(token) + 1
				}
				for id, value := range idtf {
					postings[id] = value
				}
			}
		}
	}

	retriever.writeMeta(n, mean)

	// Write every bucket concurrently and wait for all writes to finish.
	var writers sync.WaitGroup
	writers.Add(len(tf) + len(idf))
	for key, value := range tf {
		go func(r *Retriever, key string, value map[string]map[string]float32) {
			defer writers.Done()
			r.writeTF(key, value)
		}(&retriever, key, value)
	}
	for key, value := range idf {
		go func(r *Retriever, key string, value map[string]float32) {
			defer writers.Done()
			r.writeIDF(key, value)
		}(&retriever, key, value)
	}
	writers.Wait()
}