-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_table.c
361 lines (283 loc) · 11.7 KB
/
string_table.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#include "string_table.h"
/*
===============================================================================
UTILITY FUNCTIONS FOR STRING TABLE THAT ARE NOT EXPOSED IN THE HEADER FILE.
===============================================================================
*/
/*Uses the Fowler-Noll-Vo algorithm to hash the prefixes and characters*/
u_int64_t __hash_func(int prefix, int character) {
u_int64_t hash = FNV_OFFSET_BASIS_64;
unsigned char *bytes;
// First, we hash the prefix.
bytes = (unsigned char*)&prefix;
for (size_t i = 0; i < sizeof(int); i++) {
hash ^= bytes[i];
hash *= FNV_PRIME_64;
}
// Next, we hash the character.
bytes = (unsigned char*) &character;
for (size_t i = 0; i < sizeof(int); i++) {
hash ^= bytes[i];
hash *= FNV_PRIME_64;
}
return hash;
}
/*Creates a dynamically-allocated deep copy of entry.*/
strtable_entry *__deepcopy_strtable_entry(strtable_entry *entry) {
strtable_entry *copy = malloc(sizeof(strtable_entry));
copy->character = entry->character;
copy->prefix = entry->prefix;
copy->code = entry->code;
return copy;
}
/*Converts a hash-table based string table to an array based string table.
The returned table can therefore be traversed sequentially by code.*/
decompression_strtable *__compression_to_decompression_strtable(compression_strtable *table) {
decompression_strtable *decompress_table = decompression_strtable_new(table->max_size);
for (int i = 0; i < table->num_buckets; i++) {
hashed_strtable_entry *bucket = table->buckets[i];
while (bucket) { // traverse each bucket
decompress_table->arr[bucket->data.code] = bucket->data;
decompress_table->size++;
bucket = bucket->next;
}
}
return decompress_table;
}
/*
===============================================================================
COMPRESSION STRING TABLE
===============================================================================
*/
compression_strtable *compression_strtable_new(size_t max_size) {
compression_strtable *table = malloc(sizeof(compression_strtable));
// We set the hash table size to be 1.33x the max number of entries.
size_t num_buckets = (4*max_size)/3;
// initialize fields
table->num_buckets = num_buckets;
table->size = 0;
table->max_size = max_size;
table->buckets = malloc(num_buckets*sizeof(hashed_strtable_entry*));
for (int i = 0; i < num_buckets; i++) {
table->buckets[i] = NULL; // initialize all buckets with NULL
}
return table;
}
void compression_strtable_insert(compression_strtable *table, int prefix, int character) {
// In the case that the table is full, we can't insert.
if (table->size >= table->max_size) {
return;
}
u_int64_t hash = __hash_func(prefix, character);
size_t index = hash % table->num_buckets;
hashed_strtable_entry *node = malloc(sizeof(hashed_strtable_entry));
node->next = NULL;
node->data = (strtable_entry) {
.character = character,
.prefix = prefix,
.code = table->size++
};
hashed_strtable_entry *bucket = table->buckets[index];
if (!bucket) // if bucket is empty
table->buckets[index] = node;
else { // puts node in the front of the list
node->next = bucket;
table->buckets[index] = node;
}
}
strtable_entry *compression_strtable_get(compression_strtable *table, int prefix, int character) {
// we first get the bucket the code lives in
u_int64_t hash = __hash_func(prefix, character);
hashed_strtable_entry *bucket = table->buckets[hash % table->num_buckets];
// then we sequentially check each node to see if the
// (prefix, character) pairs match to get the right code
while (bucket != NULL) {
if (bucket->data.prefix == prefix && bucket->data.character == character) {
return &(bucket->data);
}
bucket = bucket->next;
}
return NULL; // we return -1 if there is no match
// this causes no concerns with representing the code -1 as
// empty because we never store (the code) -1 in the string table
}
compression_strtable *compression_strtable_prune(compression_strtable *original) {
// we want to be able to get the prefix, character pair
// based on the code, so we create a decode array to help with that
decompression_strtable *decompress_table = __compression_to_decompression_strtable(original);
// we first find the codes that we will keep
// by traversing the entire hash table
int *codes_to_keep = calloc(original->size, sizeof(int));
// although we don't formally need this
// keeping track of the pruned size adds minimal overhead
// and is helpful for debugging
int pruned_size = 0;
for (int i = 0; i < original->num_buckets; i++) {
hashed_strtable_entry *bucket = original->buckets[i];
while (bucket) {
int prefix = bucket->data.prefix;
if (prefix == -1) {
if (codes_to_keep[bucket->data.character] == 0) {
codes_to_keep[bucket->data.character] = 1;
pruned_size++;
}
}
else if (codes_to_keep[prefix] == 0) {
codes_to_keep[prefix] = 1;
pruned_size++;
}
bucket = bucket->next;
}
}
// now we need to reconstruct the hash table
// we maintain a mapping of the old codes to the new codes
// since some codes may change
// the index of the array represents the old code
// and the value of the array element represents the new code
int *old_to_new_codes = calloc(original->size, sizeof(int));
compression_strtable *pruned_table = compression_strtable_new(original->max_size);
// now we populate the pruned hash table
for (int i = 0; i < original->size; i++) {
if (codes_to_keep[i]) { // we should keep this code
strtable_entry data = decompress_table->arr[i]; // we get the prefix, character pair
old_to_new_codes[i] = pruned_table->size;
if (data.prefix == -1) // it is one of the 256 base characters
compression_strtable_insert(pruned_table, data.prefix, data.character);
else // we need to account for the fact that the prefix code may have changed
compression_strtable_insert(pruned_table, old_to_new_codes[data.prefix], data.character);
}
}
// we free the decode arr we made
decompression_strtable_free(decompress_table);
free(old_to_new_codes);
free(codes_to_keep);
return pruned_table;
}
void compression_strtable_dump(compression_strtable *table, char *filename) {
// turns hash_table into a temporary array because codes are sorted
// and it is easier to represent the strings
decompression_strtable *decompress_table = __compression_to_decompression_strtable(table);
decompression_strtable_dump(decompress_table, filename);
decompression_strtable_free(decompress_table);
}
void compression_strtable_free(compression_strtable *table) {
for (int i = 0; i < table->num_buckets; i++) {
hashed_strtable_entry* bucket = table->buckets[i];
while (bucket) {
hashed_strtable_entry *next = bucket->next;
free(bucket);
bucket = next;
}
}
free(table->buckets);
free(table);
}
/*
===============================================================================
DECOMPRESSION STRING TABLE
===============================================================================
*/
decompression_strtable *decompression_strtable_new(size_t max_size) {
decompression_strtable* table = malloc(sizeof(decompression_strtable));
table->size = 0;
table->max_size = max_size;
table->arr = calloc(max_size, sizeof(strtable_entry));
return table;
}
void decompression_strtable_insert(decompression_strtable *table, int prefix, int character) {
// In the case that the table is full, we can't insert.
if (table->size >= table->max_size) {
return;
}
table->arr[table->size] = (strtable_entry) {.prefix = prefix, .character = character, .code = table->size};
table->size++;
}
strtable_entry *decompression_strtable_get(decompression_strtable* table, int code) {
if (code < 0 || code >= table->size)
return NULL;
return &(table->arr[code]);
}
decompression_strtable *decompression_strtable_prune(decompression_strtable *original) {
// we first find the codes that we will keep
// by traversing the entire array
int *codes_to_keep = calloc(original->size, sizeof(int));
// although we don't formally need this
// keeping track of the pruned size adds minimal overhead
// and is helpful for debugging
int pruned_size = 0;
for (int i = 0; i < original->size; i++) {
int prefix = original->arr[i].prefix;
if (prefix == -1 && codes_to_keep[original->arr[i].character] == 0) {
codes_to_keep[original->arr[i].character] = 1;
pruned_size++;
}
else if (codes_to_keep[prefix] == 0) {
codes_to_keep[prefix] = 1;
pruned_size++;
}
}
// we'll now reconstruct the array
decompression_strtable *pruned_table = decompression_strtable_new(original->max_size);
// we maintain a mapping of the old codes to the new codes
// since some codes may change
int *old_to_new_codes = calloc(original->size, sizeof(int));
// now we populate the pruned array
for (int i = 0; i < original->size; i++) {
if (codes_to_keep[i]) { // we should keep this code
strtable_entry data = original->arr[i]; // we get the prefix, character pair
old_to_new_codes[i] = pruned_table->size;
// we may have to adjust the prefix since codes can change
int prefix = data.prefix == -1 ? data.prefix : old_to_new_codes[data.prefix];
decompression_strtable_insert(pruned_table, prefix, data.character);
}
}
free(old_to_new_codes);
free(codes_to_keep);
return pruned_table;
}
void decompression_strtable_free(decompression_strtable* table) {
free(table->arr);
free(table);
}
void decompression_strtable_dump(decompression_strtable* arr, char *filename) {
FILE *file = fopen(filename, "w");
fprintf(file, "String Table Dump\n");
fprintf(file, "%-8s\t%-8s\t%-12s\t%-8s\n", "Code", "Prefix", "Character", "String");
for (int i = 0; i < arr->size; i++) {
// we first convert all the prefixes to their base characters
int prefix = i;
int length = 0;
int buffer[arr->size];
memset(buffer, 0, arr->size);
if (arr->arr[i].prefix == i)
buffer[length++] = arr->arr[i].character;
else {
while (prefix != -1) {
buffer[length++] = arr->arr[prefix].character;
prefix = arr->arr[prefix].prefix;
}
}
// since we appended prefixes, we have to then reverse the array
for (int j = 0; j < length/2; j++) {
int temp = buffer[j];
buffer[j] = buffer[length - 1 - j];
buffer[length - 1 - j] = temp;
}
// now we turn an array of ints into a string
// with their integer representations
char s[12*arr->size];
int s_len = 0;
for (int k = 0; k < length; k++) {
s_len += sprintf(s + s_len, "%d ", buffer[k]);
}
fprintf(
file,
"%-8d\t%-8d\t%-12d\t%-8s\n",
i,
arr->arr[i].prefix,
arr->arr[i].character,
s
);
}
fclose(file);
}