-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsimple8b_rle.h
313 lines (281 loc) · 11.2 KB
/
simple8b_rle.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/**
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
* Implemented by Daniel Lemire, http://lemire.me/en/
* (c) Daniel Lemire, http://lemire.me/en/
*
* Follows Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words.
* Softw., Pract. Exper. 40(2): 131-147 (2010)
**/
/*****************************************************
***** Simple8b-like encoder/decoder, RLE variant *****
******************************************************
*
* Instead of using selector values 0 and 1 for encoding 240 and 120 integers,
* a single selector value 15 is used to run-length encode up to 2^28 repeated
* 32-bit integers. The rest of the selector values are shifted down by one to
* their natural position (selector values 1-14 will encode bit lengths 1-60).
*
* This also frees up selector value 0, so 64-bit output value 0 can now be
* used as end-of-stream marker as it cannot be produced by the compressor.
*
* Selector value: 0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | 15 (RLE)
* Integers coded: 0 | 60 30 20 15 12 10 8 7 6 5 4 3 2 1 | up to 2^28
* Bits/integer: 0 | 1 2 3 4 5 6 7 8 10 12 15 20 30 60 | 32 bits
*
* If larger than 32-bit integers need to be RL-encoded, the 60 bits allocated
* for selector value 15 can be shifted more towards storage and less towards
* quantity, e.g. 2^12 48-bit integers, or the other way, e.g. 2^44 16-bit ints.
*
* Performance:
*
* Decoding is somewhat faster than Simple8b most probably due to less
*branching.
* Encoding is somewhat slower than Simple8b mostly due to no optimization at
*all.
* The extra check for run-length encoding is optional, cheap and not a big
*factor.
* Code is mostly unoptimized to keep it small, portable and facilitate easy
*reuse.
*
* Since the encoding is entirely unoptimized to keep the code simple, there are
* some other bit packing schemes that can improve encoding speed, while costing
* only a minor increase in the packed size (packed bits per integer) metric.
* If your dataset does not have a lot of 1 bit integers, these may work better:
*
* Selector value: 0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | 15 (RLE)
* Integers coded: 0 | 30 20 15 12 10 9 8 7 6 6 5 4 3 2 | up to 2^28
* Bits/integer: 0 | 2 3 4 5 6 6 7 8 9 10 12 15 20 30 | 32 bits
*
* Selector value: 0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | 15 (RLE)
* Integers coded: 0 | 20 15 12 11 10 9 8 7 6 6 5 5 4 3 | up to 2^28
* Bits/integer: 0 | 3 4 5 5 6 6 7 8 9 10 11 12 15 20 | 32 bits
*
**/
#ifndef SIMPLE8B_RLE_H_
#define SIMPLE8B_RLE_H_
#include "common.h"
#include "codecs.h"
#define _SIMPLE8B_USE_RLE // optional, comment this line if run-length encoding
// is not needed
namespace FastPForLib {
static const uint32_t Simple8b_Codec_bitLength[] = {1, 1, 2, 3, 4, 5, 6, 7,
8, 10, 12, 15, 20, 30, 60, 32};
// static const uint32_t Simple8b_Codec_bitLength[] = { 1, 2, 3, 4, 5, 6, 6, 7, 8, 9,
// 10, 12, 15, 20, 30, 32 };
// static const uint32_t Simple8b_Codec_bitLength[] = { 1, 3, 4, 5, 5, 6, 6, 7, 8, 9,
// 10, 11, 12, 15, 20, 32 };
// static const uint32_t Simple8b_Codec_bitLength[] = { 1, 4, 4, 5, 5, 6, 6, 7, 8, 9,
// 10, 11, 12, 14, 15, 32 };
/****************************************
********** Simple8b-like codec **********
*****************************************/
class Simple8b_Codec {
static const uint32_t SIMPLE8B_BITSIZE = 60;
static const uint32_t SIMPLE8B_MAXCODE = 15;
static const uint32_t SIMPLE8B_MINCODE = 1;
#if defined(_SIMPLE8B_USE_RLE)
static const uint32_t RLE_MAX_VALUE_BITS = 32;
static const uint32_t RLE_MAX_COUNT_BITS =
SIMPLE8B_BITSIZE - RLE_MAX_VALUE_BITS;
static const uint64_t RLE_MAX_VALUE_MASK = (1ULL << RLE_MAX_VALUE_BITS) - 1;
static const uint64_t RLE_MAX_COUNT_MASK = (1ULL << RLE_MAX_COUNT_BITS) - 1;
// check if next integer repeats, return count if packs better, otherwise 0
static uint32_t tryRunLength(const uint32_t *input, uint32_t pos,
uint32_t count) {
uint32_t startPos = pos;
uint32_t endPos = pos + count;
uint32_t test = input[pos++];
if (test > RLE_MAX_VALUE_MASK)
return 0;
while (pos < endPos && test == input[pos]) {
pos++;
}
uint32_t bitLen = bits(test | 1U);
uint32_t repeatCount = (pos - startPos);
return (bitLen * repeatCount >= SIMPLE8B_BITSIZE) ? repeatCount : 0;
}
#endif
public:
static int32_t Compress(const uint32_t *input, int32_t inOffset,
uint32_t inCount, uint64_t *output,
int32_t outOffset) {
uint32_t inPos = inOffset;
uint32_t inEnd = inOffset + inCount;
uint32_t outPos = outOffset;
while (inPos < inEnd) {
uint32_t remainingCount = inEnd - inPos;
uint64_t outVal = 0;
#if defined(_SIMPLE8B_USE_RLE)
// try if run-length encoding packs better than bit packing
uint64_t repeatCount = tryRunLength(input, inPos, remainingCount);
if (repeatCount > 0) {
repeatCount = (repeatCount < RLE_MAX_COUNT_MASK) ? repeatCount
: RLE_MAX_COUNT_MASK;
outVal = (static_cast<uint64_t>(SIMPLE8B_MAXCODE) << SIMPLE8B_BITSIZE) |
(repeatCount << RLE_MAX_VALUE_BITS) | input[inPos];
inPos += static_cast<uint32_t>(repeatCount);
} else
#endif
{
// otherwise try all the bit packing possibilities
uint32_t code = SIMPLE8B_MINCODE;
for (; code < SIMPLE8B_MAXCODE; code++) {
uint32_t intNum = Simple8b_Codec_bitLength[SIMPLE8B_MAXCODE - code];
uint32_t bitLen = Simple8b_Codec_bitLength[code];
intNum = (intNum < remainingCount) ? intNum : remainingCount;
uint64_t maxVal = (1ULL << bitLen) - 1;
uint64_t val = static_cast<uint64_t>(code) << SIMPLE8B_BITSIZE;
uint32_t j = 0;
for (; j < intNum; j++) {
uint64_t inputVal = static_cast<uint64_t>(input[inPos + j]);
if (inputVal > maxVal) {
break;
}
val |= inputVal << (j * bitLen);
}
if (j == intNum) {
outVal = val;
inPos += intNum;
break;
}
}
// if no bit packing possible, encode just one value
if (code == SIMPLE8B_MAXCODE) {
outVal = (static_cast<uint64_t>(code) << SIMPLE8B_BITSIZE) |
input[inPos++];
}
}
output[outPos++] = outVal;
}
return outPos - outOffset;
}
static uint32_t Decompress(const uint64_t *input, uint32_t,
uint32_t *output, uint32_t outOffset,
uint32_t outCount) {
uint32_t inPos = 0;
uint32_t outPos = outOffset;
uint32_t outEnd = outOffset + outCount;
while (outPos < outEnd) {
uint32_t remainingCount = outEnd - outPos;
uint64_t val = input[inPos++];
uint32_t code = static_cast<uint32_t>(val >> SIMPLE8B_BITSIZE);
// optional check for end-of-stream
if (code == 0) {
break; // end of stream
}
#if defined(_SIMPLE8B_USE_RLE)
else if (code == SIMPLE8B_MAXCODE) {
// decode rle-encoded integers
uint32_t repeatValue = static_cast<uint32_t>(val & RLE_MAX_VALUE_MASK);
uint32_t repeatCount = static_cast<uint32_t>(
(val >> RLE_MAX_VALUE_BITS) & RLE_MAX_COUNT_MASK);
repeatCount =
(repeatCount < remainingCount) ? repeatCount : remainingCount;
for (uint32_t i = 0; i < repeatCount; ++i) {
output[outPos++] = repeatValue;
}
}
#endif
else {
// decode bit-packed integers
uint32_t intNum = Simple8b_Codec_bitLength[SIMPLE8B_MAXCODE - code];
uint32_t bitLen = Simple8b_Codec_bitLength[code];
uint64_t bitMask = (1ULL << bitLen) - 1;
intNum = (intNum < remainingCount)
? intNum
: remainingCount; // optional buffer end check
#if defined(_DONT_UNROLL_LOOP)
for (uint32_t i = 0; i < intNum; ++i) {
output[outPos++] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
}
#else
uint32_t i = 0;
for (; i < intNum; i += 8) {
auto out = output + outPos + i;
out[0] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[1] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[2] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[3] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[4] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[5] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[6] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
out[7] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
}
for (; i < intNum; ++i) {
auto out = output + outPos + i;
out[0] = static_cast<uint32_t>(val & bitMask);
val >>= bitLen;
}
outPos += intNum;
#endif
}
}
return inPos;
}
};
/**
* If MarkLength is true, than the number of symbols is written
* in the stream. Otherwise you need to specify it using the nvalue
* parameter decodeArray.
*
* Note that when MarkLength is false, some unaligned (64-bit vs. 32-bit)
* access are possible. This may fail on non-x86 platforms.
*/
template <bool MarkLength> class Simple8b_RLE : public IntegerCODEC {
public:
std::string name() const { return "Simple8b_RLE"; }
void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) {
if (MarkLength) {
*out++ = static_cast<uint32_t>(length);
}
// this may lead to unaligned access. Performance may be affected.
// not much of an effect in practice on recent Intel processors.
uint64_t *out64 = reinterpret_cast<uint64_t *>(out);
auto count = Simple8b_Codec::Compress(in, 0, length, out64, 0);
nvalue = count * 2;
}
const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) {
(void)length;
uint32_t markednvalue;
if (MarkLength) {
markednvalue = *in++;
if (markednvalue > nvalue)
throw NotEnoughStorage(markednvalue);
}
const size_t actualvalue = MarkLength ? markednvalue : nvalue;
// this may lead to unaligned access. Performance may be affected.
// not much of an effect in practice on recent Intel processors.
const uint64_t *in64 = reinterpret_cast<const uint64_t *>(in);
#ifndef NDEBUG
const uint32_t *const endin(in + length);
const uint64_t *finalin64 = reinterpret_cast<const uint64_t *>(endin);
#endif
if (nvalue < actualvalue) {
fprintf(stderr, "possible overrun\n");
}
nvalue = actualvalue;
uint32_t pos = 0;
pos = Simple8b_Codec::Decompress(in64, 0, out, 0, nvalue);
assert(in64 + pos <= finalin64);
in = reinterpret_cast<const uint32_t *>(in64 + pos);
assert(in <= endin);
nvalue = MarkLength ? actualvalue : nvalue;
return in;
}
};
#undef _SIMPLE8B_USE_RLE
} // namespace FastPForLib
#endif /* SIMPLE8B_RLE_H_ */