-
Notifications
You must be signed in to change notification settings - Fork 0
/
shannon.h
193 lines (171 loc) · 5.78 KB
/
shannon.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#ifndef SHANNON_H
#define SHANNON_H
#define MIN_ENCODED_SIZE (unsigned long)8
#include "trie.h"
#include "util.h"
#include <boost/dynamic_bitset.hpp>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>
typedef std::pair<boost::dynamic_bitset<>, size_t> EncodedCode;
template <typename T> class ShannonCode {
private:
std::vector<T> syms;
std::unordered_map<T, EncodedCode> sym_codes;
TrieNode<T> *decode_trie;
std::vector<std::pair<T, int>> get_frequencies(const std::vector<T> &syms) {
// create freq map
std::unordered_map<T, int> freq;
for (const T &sym : syms)
freq[sym]++;
// map to vector
std::vector<std::pair<T, int>> freq_vec;
std::copy(freq.begin(), freq.end(), std::back_inserter(freq_vec));
// sort vector in decreasing order by freq
std::sort(freq_vec.begin(), freq_vec.end(),
[](auto const &l, auto const &r) { return l.second > r.second; });
return freq_vec;
}
std::unordered_map<T, EncodedCode>
assign_codes(const std::vector<std::pair<T, int>> &freq_vec) const {
// call recursive helper
std::unordered_map<T, EncodedCode> sym_codes;
boost::dynamic_bitset<> code(MIN_ENCODED_SIZE);
return assign_codes(freq_vec, 0, freq_vec.size() - 1, {code, 0}, sym_codes);
}
std::unordered_map<T, EncodedCode>
assign_codes(const std::vector<std::pair<T, int>> &freq_vec, int l, int r,
EncodedCode &&code,
std::unordered_map<T, EncodedCode> &sym_codes) const {
// Found code for given symbol
if (l == r) {
sym_codes[freq_vec[l].first] = code;
return sym_codes;
}
// Find total and midpt sum
int total = 0;
for (int i = l; i <= r; i++) {
total += freq_vec[i].second;
}
// Find index to split on
int acc = 0, i;
for (i = l; acc <= total / 2; acc += freq_vec[i++].second)
;
int accprev = acc - freq_vec[i - 1].second;
if (std::abs(acc - (total - acc)) > std::abs(accprev - (total - accprev))) {
acc = accprev;
i--;
}
// recurse left
// create new dynamic bitset
boost::dynamic_bitset<> left_code = code.first;
left_code.resize(std::max(MIN_ENCODED_SIZE, code.second == code.first.size()
? 2 * code.first.size()
: code.first.size()));
// set new least significant bit to 0
left_code <<= 1;
left_code.reset(0);
assign_codes(freq_vec, l, i - 1, {left_code, code.second + 1}, sym_codes);
// recurse right
// create new dynamic bitset
boost::dynamic_bitset<> right_code = code.first;
right_code.resize(
std::max(MIN_ENCODED_SIZE, code.second == code.first.size()
? 2 * code.first.size()
: code.first.size()));
// set new least significant bit to 0
right_code <<= 1;
right_code.set(0);
assign_codes(freq_vec, i, r, {right_code, code.second + 1}, sym_codes);
return sym_codes;
}
TrieNode<T> *build_decode_trie() const {
TrieNode<T> *root = new TrieNode<T>();
for (auto p : sym_codes) {
TrieNode<T> *curr = root;
EncodedCode &sym_code = p.second;
// for each bit in code, build new trie node
for (int idx = sym_code.second - 1; idx >= 0; idx--) {
int bit = sym_code.first[idx];
if (!curr->next[bit]) {
curr->next[bit] = new TrieNode<T>();
}
curr = curr->next[bit];
}
curr->value = p.first;
}
return root;
}
public:
ShannonCode(std::vector<T> &syms) : syms(syms) {
// Get frequency list of each symbol, sorted in decreasing order
std::vector<std::pair<T, int>> freq_vec = get_frequencies(syms);
// debug
std::cout << "freq_vec" << std::endl;
std::cout << '[';
for (auto &p : freq_vec) {
std::cout << '(' << p.first << ", " << p.second << "), ";
}
std::cout << ']' << std::endl;
// Assign codes to each symbol
sym_codes = assign_codes(freq_vec);
// debug
std::cout << "sym_codes" << std::endl;
std::cout << '{';
for (auto &p : sym_codes) {
std::cout << p.first << ": " << p.second.first << ", ";
}
std::cout << "}" << std::endl;
// Build decode trie
decode_trie = build_decode_trie();
// debug
std::cout << "decode_trie" << std::endl;
std::cout << decode_trie << std::endl;
}
~ShannonCode() {
// free entire decode_trie structure
delete decode_trie;
}
// encodes the symbols specified in the constructor and returns a dynamic
// bitset of exact length of the code
boost::dynamic_bitset<> encode() const {
boost::dynamic_bitset<> msg(MIN_ENCODED_SIZE);
msg.reset();
size_t size = 0;
for (const T &sym : syms) {
// use at instead of [] to preserve const
const EncodedCode &sym_code = sym_codes.at(sym);
// double size
while (size + sym_code.second > msg.size()) {
msg.resize(msg.size() << 1);
}
// add code to end of message
msg <<= sym_code.second;
// need to |= bitsets of equal length
// manually shift each bit to save space
for (int i = 0; i < sym_code.second; msg[i++] = sym_code.first[i])
;
// increment size
size += sym_code.second;
}
// resize msg to exact size
msg.resize(size);
return msg;
};
// decode the given encoded block. Assumes that the code starts from the most
// significant bit.
std::vector<T> decode(boost::dynamic_bitset<> encoded) const {
std::vector<T> decoded;
TrieNode<T> *curr = decode_trie;
for (int i = encoded.size() - 1; i >= 0; i--) {
curr = curr->next[encoded[i]];
if (curr->has_value()) {
decoded.push_back(curr->value);
curr = decode_trie;
}
}
return decoded;
};
};
#endif