Skip to content

Commit

Permalink
moreso
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Aug 22, 2024
1 parent ced11ef commit 5f05db6
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 24 deletions.
72 changes: 58 additions & 14 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE};
use crate::{CodeMeta, Compressor, Symbol, ESCAPE_CODE, MAX_CODE};

/// Bitmap that only works for values up to 512
#[derive(Clone, Copy, Debug, Default)]
Expand Down Expand Up @@ -43,6 +43,18 @@ impl CodesBitmap {
reference: 0,
}
}

/// Clear the bitmap of all entries.
pub(crate) fn clear(&mut self) {
self.codes[0] = 0;
self.codes[1] = 0;
self.codes[2] = 0;
self.codes[3] = 0;
self.codes[4] = 0;
self.codes[5] = 0;
self.codes[6] = 0;
self.codes[7] = 0;
}
}

struct CodesIterator<'a> {
Expand Down Expand Up @@ -179,6 +191,15 @@ impl Counter {
fn second_codes(&self, code1: u16) -> CodesIterator {
self.pair_index[code1 as usize].codes()
}

/// Clear the counters.
/// Note that this just touches the bitmaps and sets them all to invalid.
fn clear(&mut self) {
self.code1_index.clear();
for index in &mut self.pair_index {
index.clear();
}
}
}

/// The number of generations used for training. This is taken from the [FSST paper].
Expand All @@ -190,6 +211,26 @@ const MAX_GENERATIONS: usize = 5;
const MAX_GENERATIONS: usize = 2;

impl Compressor {
/// Clear all set items from the compressor.
///
/// This is considerably faster than building a new Compressor from scratch for each
/// iteration of the `train` loop.
fn clear(&mut self) {
// Eliminate every observed code from the table.
for code in 0..(256 + self.n_symbols as usize) {
let symbol = self.symbols[code];
if symbol.len() <= 2 {
// Clear the codes_twobyte array
self.codes_twobyte[symbol.first_two_bytes() as usize] = CodeMeta::EMPTY;
} else {
// Clear the hashtable
self.lossy_pht.remove(symbol);
}
}

self.n_symbols = 0;
}

/// Build and train a `Compressor` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
Expand All @@ -200,22 +241,24 @@ impl Compressor {
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> Self {
let mut compressor = Self::default();
let mut compressor = Compressor::default();
// TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
let sample = corpus.as_ref();
if sample.is_empty() {
return compressor;
}

let mut counter = Counter::new();
for _generation in 0..(MAX_GENERATIONS - 1) {
let mut counter = Counter::new();
compressor.compress_count(sample, &mut counter);
compressor = compressor.optimize(&counter, true);
compressor.optimize(&counter, true);
counter.clear();
}

let mut counter = Counter::new();
compressor.compress_count(sample, &mut counter);
compressor.optimize(&counter, true)
compressor.optimize(&counter, true);

compressor
}
}

Expand Down Expand Up @@ -258,19 +301,19 @@ impl Compressor {

/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self {
let mut res = Compressor::default();
fn optimize(&mut self, counters: &Counter, include_ascii: bool) {
let mut pqueue = BinaryHeap::with_capacity(65_536);

for code1 in counters.first_codes() {
let symbol1 = self.symbols[code1 as usize];
let symbol1_len = symbol1.len();
let count = counters.count1(code1);
// If count is zero, we can skip the whole inner loop.
if count == 0 {
continue;
}

let mut gain = count * symbol1.len();
let mut gain = count * symbol1_len;
// NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
// This helps to reduce exception counts.
if code1 < 256 {
Expand All @@ -287,7 +330,7 @@ impl Compressor {
let symbol2 = &self.symbols[code2 as usize];

// If merging would yield a symbol of length greater than 8, skip.
if symbol1.len() + symbol2.len() > 8 {
if symbol1_len + symbol2.len() > 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
Expand All @@ -301,11 +344,14 @@ impl Compressor {
}
}

// clear self in advance of inserting the symbols.
self.clear();

// Pop the 255 best symbols.
let mut n_symbols = 0;
while !pqueue.is_empty() && n_symbols < 255 {
let candidate = pqueue.pop().unwrap();
if res.insert(candidate.symbol) {
if self.insert(candidate.symbol) {
n_symbols += 1;
}
}
Expand All @@ -323,13 +369,11 @@ impl Compressor {
break;
}

if res.insert(Symbol::from_u8(character)) {
if self.insert(Symbol::from_u8(character)) {
n_symbols += 1
}
}
}

res
}
}

Expand Down
5 changes: 1 addition & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,6 @@ pub struct Compressor {
/// the escape values.
pub(crate) n_symbols: u8,

//
// Index structures used to speedup building the symbol table and compression
//
/// Inverted index mapping 2-byte symbols to codes
codes_twobyte: Vec<CodeMeta>,

Expand Down Expand Up @@ -371,7 +368,7 @@ impl Compressor {
// Insert the 2-byte symbol into the twobyte cache
self.codes_twobyte[symbol.first_two_bytes() as usize] =
CodeMeta::new_symbol(self.n_symbols, symbol);
} else if symbol_len >= 3 {
} else {
// Attempt to insert larger symbols into the 3-byte cache
if !self.lossy_pht.insert(symbol, self.n_symbols) {
return false;
Expand Down
21 changes: 15 additions & 6 deletions src/lossy_pht.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ pub(crate) struct LossyPHT {
impl LossyPHT {
/// Construct a new empty lossy perfect hash table
pub(crate) fn new() -> Self {
let slots = [TableEntry {
symbol: Symbol::ZERO,
code: CodeMeta::EMPTY,
ignored_bits: 64,
}]
.repeat(HASH_TABLE_SIZE);
let slots = vec![
TableEntry {
symbol: Symbol::ZERO,
code: CodeMeta::EMPTY,
ignored_bits: 64,
};
HASH_TABLE_SIZE
];

Self { slots }
}
Expand All @@ -92,6 +94,13 @@ impl LossyPHT {
}
}

/// Remove the symbol from the hashtable, if it exists.
pub(crate) fn remove(&mut self, symbol: Symbol) {
let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF;
let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
self.slots[slot].code = CodeMeta::EMPTY;
}

#[inline]
pub(crate) fn lookup(&self, word: u64) -> TableEntry {
let prefix_3bytes = word & 0xFF_FF_FF;
Expand Down
8 changes: 8 additions & 0 deletions tests/correctness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ fn test_zeros() {
assert_eq!(trained.decompressor().decompress(&compressed), &[4, 0]);
}

#[test]
fn test_small() {
let corpus = b"hello world 12345";
let trained = Compressor::train(corpus);
let compressed = trained.compress(corpus);
assert_eq!(trained.decompressor().decompress(&compressed), corpus);
}

#[cfg_attr(miri, ignore)]
#[test]
fn test_large() {
Expand Down

0 comments on commit 5f05db6

Please sign in to comment.