Skip to content

Commit

Permalink
deny(missing_docs), 512 -> 511
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Aug 12, 2024
1 parent 4618d92 commit 1852376
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 51 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 16 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
[package]
name = "fsst-rs"
version = "0.1.0"
version = "0.0.1"
edition = "2021"

[lints.rust]
warnings = "deny"
missing_docs = "deny"

[lints.clippy]
all = { level = "deny", priority = -1 }
if_then_some_else_none = { level = "deny" }
mem_forget = { level = "deny" }
or_fun_call = "deny"
panic_in_result_fn = { level = "deny" }
same_name_method = { level = "deny" }
tests_outside_test_module = { level = "deny" }
unwrap_in_result = { level = "deny" }
use_debug = { level = "deny" }

[dev-dependencies]
criterion = "0.5"
lz4 = "1"
Expand Down
10 changes: 8 additions & 2 deletions benches/compress.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
//! Compression benchmark.
//!
//! Contains benchmarks for FSST compression, decompression, and symbol table training.
//!
//! Also contains LZ4 baseline.
#![allow(missing_docs)]
use std::io::{Cursor, Read, Write};

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use lz4::liblz4::BlockChecksum;
use lz4::{BlockSize, ContentChecksum};

use fsst_rs::{train, SymbolTable};
use fsst_rs::{train, Code};

const CORPUS: &str = include_str!("dracula.txt");
const TEST: &str = "I found my smattering of German very useful here";
Expand All @@ -22,7 +28,7 @@ fn bench_fsst(c: &mut Criterion) {
let compressed = table.compress(plaintext);
let escape_count = compressed
.iter()
.filter(|b| **b == SymbolTable::ESCAPE)
.filter(|b| **b == Code::ESCAPE_CODE)
.count();
let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
println!(
Expand Down
39 changes: 27 additions & 12 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ struct Counter {
impl Counter {
fn new() -> Self {
Self {
counts1: vec![0; 512],
counts2: vec![vec![0; 512]; 512],
counts1: vec![0; 511],
counts2: vec![vec![0; 511]; 511],
}
}

Expand All @@ -47,8 +47,20 @@ impl Counter {
}
}

/// The number of generations used for training. This is taken from the [FSST paper].
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;

/// Build and train a `SymbolTable` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
/// code).
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable {
let mut table = SymbolTable::default();
// TODO(aduffy): handle truncating/sampling if the corpus exceeds the required sample size.
Expand Down Expand Up @@ -87,7 +99,7 @@ impl SymbolTable {
fn optimize(&self, counters: Counter) -> Self {
let mut res = SymbolTable::default();
let mut pqueue = BinaryHeap::new();
for code1 in 0..512 {
for code1 in 0..511 {
let code1 = Code::from_u16(code1);
let symbol1 = self.symbols[code1.0 as usize];
let gain = counters.count1(code1) * symbol1.len();
Expand All @@ -96,7 +108,7 @@ impl SymbolTable {
gain,
});

for code2 in 0..512 {
for code2 in 0..511 {
let code2 = Code::from_u16(code2);
let symbol2 = &self.symbols[code2.0 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
Expand Down Expand Up @@ -130,6 +142,9 @@ impl SymbolTable {
}
}

/// A candidate for inclusion in a symbol table.
///
/// This is really only useful for the `optimize` step of training.
struct Candidate {
gain: usize,
symbol: Symbol,
Expand Down Expand Up @@ -166,7 +181,7 @@ impl Ord for Candidate {

#[cfg(test)]
mod test {
use crate::{train, SymbolTable};
use crate::{train, Code};

#[test]
fn test_builder() {
Expand All @@ -178,24 +193,24 @@ mod test {
let compressed = table.compress(text.as_bytes());

// Ensure that the compressed string has no escape bytes
assert!(compressed.iter().all(|b| *b != SymbolTable::ESCAPE));
assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE));

// Ensure that we can compress a string with no values seen at training time.
let compressed = table.compress("xyz123".as_bytes());
assert_eq!(
compressed,
vec![
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'x',
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'y',
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'z',
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'1',
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'2',
SymbolTable::ESCAPE,
Code::ESCAPE_CODE,
b'3',
]
)
Expand Down
101 changes: 68 additions & 33 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
//! A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper].
//!
//! FSST is a string compression algorithm meant for use in database systems. It was designed by
//! [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression
//! and decompression of strings at compression rates competitive with or better than LZ4.
//!
//! NOTE: This current implementation is still in-progress, please use at your own risk.
//!
//! [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
use std::fmt::{Debug, Formatter};

pub use builder::*;

mod builder;
mod longest;

pub const ESCAPE: u8 = 0xFF;

/// A Symbol wraps a sequence of up to 8 bytes.
#[derive(Copy, Clone)]
pub union Symbol {
Expand All @@ -21,8 +29,10 @@ impl Debug for Symbol {
}

impl Symbol {
/// Zero value for `Symbol`.
pub const ZERO: Self = Self::zero();

/// Builds a `Symbol` by copying the contents of an 8-element byte array.
pub fn from_slice(slice: &[u8; 8]) -> Self {
    let bytes = *slice;
    Self { bytes }
}
Expand Down Expand Up @@ -53,43 +63,26 @@ impl Symbol {
size_of::<Self>() - null_bytes
}

/// Reports whether this symbol has length zero, i.e. it encodes no bytes.
///
/// Only the zero code is expected to satisfy this.
pub fn is_empty(&self) -> bool {
    let encoded_len = self.len();
    encoded_len == 0
}

/// Returns the bytes of this symbol as a slice, truncated to the symbol's length.
pub fn as_slice(&self) -> &[u8] {
let len = self.len();
// Safety: the length from `len()` can never be more than 8.
// SAFETY: constructors will not allow building a struct where len > 8.
unsafe { &self.bytes[0..len] }
}

pub fn append_to(&self, vec: &mut Vec<u8>) {
match self.len() {
0 => self.append_inner::<0>(vec),
1 => self.append_inner::<1>(vec),
2 => self.append_inner::<2>(vec),
3 => self.append_inner::<3>(vec),
4 => self.append_inner::<4>(vec),
5 => self.append_inner::<5>(vec),
6 => self.append_inner::<6>(vec),
7 => self.append_inner::<7>(vec),
8 => self.append_inner::<8>(vec),
_ => unreachable!("Symbol::len() always ≤ 8"),
}
}

fn append_inner<const N: usize>(&self, vec: &mut Vec<u8>) {
for i in 0..N {
let byte: u8 = unsafe { self.num >> i } as u8;
vec.push(byte);
}
}

/// Checks whether `text` begins with the bytes of this symbol.
pub fn is_prefix(&self, text: &[u8]) -> bool {
    let symbol_bytes = self.as_slice();
    text.starts_with(symbol_bytes)
}

/// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
pub fn concat(&self, other: &Self) -> Self {
let new_len = self.len() + other.len();
assert!(new_len <= 8, "cannot build symbol with length > 8");
Expand All @@ -102,20 +95,44 @@ impl Symbol {
}
}

/// Codes correspond to bytes.
/// Codes used to map symbols to bytes.
///
/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit
/// value packed into a `u16`.
///
/// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes
/// 0 through 255. `Code(256)` through `Code(511)` represent the actual codes 0-255.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Code(u16);

impl Code {
/// Maximum code value for the in-memory `Code` representation.
pub const CODE_MAX: u16 = 512;

/// Maximum code value. Code 255 is reserved as the [escape code][`Self::ESCAPE_CODE`].
pub const MAX_CODE: u8 = 254;

/// Code used to indicate bytes that are not in the symbol table.
///
/// When compressing a string that cannot fully be expressed with the symbol table, the compressed
/// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence
/// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of
/// being looked up in the symbol table.
pub const ESCAPE_CODE: u8 = 255;

/// Builds the code that stands for the escaped raw byte `byte`.
///
/// The byte value is widened to `u16` unchanged.
pub fn new_escaped(byte: u8) -> Self {
    Self(u16::from(byte))
}

/// Builds the code that stands for the symbol with index `code`.
///
/// The stored value is `code` offset by 256.
///
/// # Panics
/// Panics if `code` equals [`Code::ESCAPE_CODE`], which is reserved for escapes.
pub fn new_symbol(code: u8) -> Self {
    assert_ne!(
        code,
        Code::ESCAPE_CODE,
        "code {code} cannot be used for symbol, reserved for ESCAPE"
    );

    let widened = u16::from(code);
    Self(widened + 256)
}

Expand All @@ -136,10 +153,27 @@ impl Code {
}
}

/// The static symbol table used for compression and decompression.
///
/// The `SymbolTable` is the central component of FSST. You can create a SymbolTable either by
/// default, or by [training] it on an input corpus of text.
///
/// Example usage:
///
/// ```
/// use fsst_rs::{Symbol, SymbolTable};
/// let mut table = SymbolTable::default();
/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]));
///
/// let compressed = table.compress("hello".as_bytes());
/// assert_eq!(compressed, vec![0u8]);
/// ```
///
/// [training]: train
#[derive(Clone, Debug)]
pub struct SymbolTable {
/// Table mapping codes to symbols.
pub(crate) symbols: [Symbol; 512],
pub(crate) symbols: [Symbol; 511],

/// Indicates the number of entries in the symbol table that have been populated.
///
Expand All @@ -151,7 +185,7 @@ pub struct SymbolTable {
impl Default for SymbolTable {
fn default() -> Self {
let mut table = Self {
symbols: [Symbol::ZERO; 512],
symbols: [Symbol::ZERO; 511],
n_symbols: 0,
};

Expand All @@ -170,14 +204,15 @@ impl Default for SymbolTable {
/// The symbol table is trained on a corpus of data in the form of a single byte array, building up
/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytes, or "symbols".
impl SymbolTable {
pub const ESCAPE: u8 = 255;

/// Insert a new symbol at the end of the table.
///
/// # Panics
/// Panics if the table is already full.
pub fn insert(&mut self, symbol: Symbol) {
assert!(self.n_symbols < 512, "cannot insert into full symbol table");
assert!(
self.n_symbols < self.symbols.len(),
"cannot insert into full symbol table"
);
self.symbols[self.n_symbols] = symbol;
self.n_symbols += 1;
}
Expand All @@ -193,7 +228,7 @@ impl SymbolTable {
if next_code.is_escape() {
// Case 1 -escape: push an ESCAPE followed by the next byte.
// println!("ESCAPE");
values.push(Self::ESCAPE);
values.push(Code::ESCAPE_CODE);
values.push(next_code.0 as u8);
pos += 1;
} else {
Expand All @@ -218,7 +253,7 @@ impl SymbolTable {

while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::<Symbol>()) {
let code = compressed[in_pos];
if code == SymbolTable::ESCAPE {
if code == Code::ESCAPE_CODE {
// Advance by one, do raw write.
in_pos += 1;
// SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer
Expand Down
3 changes: 1 addition & 2 deletions src/longest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ impl SymbolTable {
// Find the code that best maps to the provided text table here.
let mut best_code = Code::new_escaped(text[0]);
let mut best_overlap = 1;
for code in 0..512 {
for code in 0..511 {
let symbol = &self.symbols[code as usize];
if symbol.is_prefix(text) && symbol.len() > best_overlap {
// println!("using ideal code: code={code} symbol{:?} len={}", symbol.as_slice(), symbol.len());
best_code = Code::from_u16(code);
best_overlap = symbol.len();
}
Expand Down

0 comments on commit 1852376

Please sign in to comment.