forked from nagadomi/kaggle-lshtc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnearest_centroid_classifier.hpp
184 lines (167 loc) · 4.26 KB
/
nearest_centroid_classifier.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#ifndef NEAREST_CENTROID_CLASSIFIER_HPP
#define NEAREST_CENTROID_CLASSIFIER_HPP
#include "util.hpp"
#include "inverted_index.hpp"
class NearestCentroidClassifier
{
private:
std::vector<fv_t> m_centroids;
std::vector<int> m_centroid_labels;
InvertedIndex m_inverted_index;
static void
vector_sum(fv_t &sum,
const std::vector<int> &indexes,
const std::vector<fv_t> &data)
{
sum.clear();
for (auto i = indexes.begin(); i != indexes.end(); ++i) {
const fv_t &x = data[*i];
for (auto word = x.begin(); word != x.end(); ++word) {
auto s = sum.find(word->first);
if (s != sum.end()) {
s->second += word->second;
} else {
sum.insert(std::make_pair(word->first, word->second));
}
}
}
}
static void
vector_normalize_l2(fv_t &x)
{
double dot = 0.0f;
for (auto i = x.begin(); i != x.end(); ++i) {
dot += i->second * i->second;
}
if (dot > 0.0f) {
double scale = 1.0f / std::sqrt(dot);
for (auto i = x.begin(); i != x.end(); ++i) {
i->second *= scale;
}
}
}
public:
NearestCentroidClassifier(){}
void
train(const category_index_t &category_index,
const std::vector<fv_t> &data)
{
for (auto l = category_index.begin(); l != category_index.end(); ++l) {
fv_t centroid;
vector_sum(centroid, l->second, data);
vector_normalize_l2(centroid);
m_centroids.push_back(centroid);
m_centroid_labels.push_back(l->first);
}
m_inverted_index.build(&m_centroids);
}
inline void
predict(std::vector<int> &results,
size_t k,
const fv_t &query) const
{
InvertedIndex::result_t knn;
m_inverted_index.knn(knn, k, query);
results.clear();
for (auto i = knn.begin(); i != knn.end(); ++i) {
results.push_back(m_centroid_labels[i->id]);
}
}
size_t
size(void) const
{
return m_centroids.size();
}
bool
save(const char *file) const
{
FILE *fp = std::fopen(file, "wb");
if (fp == 0) {
return false;
}
size_t size = m_centroids.size();
std::fwrite(&size, sizeof(size), 1, fp);
for (auto centroid = m_centroids.begin();
centroid != m_centroids.end(); ++centroid)
{
size = centroid->size();
std::fwrite(&size, sizeof(size), 1, fp);
for (auto w = centroid->begin(); w != centroid->end(); ++w) {
std::fwrite(&w->first, sizeof(w->first), 1, fp);
std::fwrite(&w->second, sizeof(w->second), 1, fp);
}
}
size = m_centroid_labels.size();
std::fwrite(&size, sizeof(size), 1, fp);
std::fwrite(m_centroid_labels.data(), sizeof(int), size, fp);
fclose(fp);
return true;
}
bool
load(const char *file)
{
FILE *fp = std::fopen(file, "rb");
if (fp == 0) {
return false;
}
m_centroids.clear();
m_centroid_labels.clear();
m_inverted_index.clear();
size_t centroid_num = 0;
size_t ret = std::fread(¢roid_num, sizeof(centroid_num), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 1\n", file);
fclose(fp);
return false;
}
for (size_t i = 0; i < centroid_num; ++i) {
fv_t centroid;
size_t word_num = 0;
ret = fread(&word_num, sizeof(word_num), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 2\n", file);
fclose(fp);
return false;
}
for (size_t j = 0; j < word_num; ++j) {
int word_id;
float word_weight;
ret = std::fread(&word_id, sizeof(word_id), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 3\n", file);
fclose(fp);
return false;
}
ret = std::fread(&word_weight, sizeof(word_weight), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 4\n", file);
fclose(fp);
return false;
}
centroid.insert(std::make_pair(word_id, word_weight));
}
m_centroids.push_back(centroid);
}
ret = std::fread(¢roid_num, sizeof(centroid_num), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 5\n", file);
fclose(fp);
return false;
}
int *buffer = new int[centroid_num];
ret = std::fread(buffer, sizeof(int), centroid_num, fp);
if (ret != centroid_num) {
std::fprintf(stderr, "%s: invalid format 6\n", file);
delete buffer;
fclose(fp);
return false;
}
std::copy(buffer, buffer + centroid_num,
std::back_inserter(m_centroid_labels));
delete buffer;
fclose(fp);
m_inverted_index.build(&m_centroids);
return true;
}
};
#endif