From 18523764245ac4a96aab8a5f8f8fa57d0225e2b1 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 17:51:07 -0400 Subject: [PATCH] deny(missing_docs), 512 -> 511 --- Cargo.lock | 2 +- Cargo.toml | 17 +++++++- benches/compress.rs | 10 ++++- src/builder.rs | 39 +++++++++++------ src/lib.rs | 101 +++++++++++++++++++++++++++++--------------- src/longest.rs | 3 +- 6 files changed, 121 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef3912f..48d9198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,7 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" dependencies = [ "criterion", "lz4", diff --git a/Cargo.toml b/Cargo.toml index 301d560..030301c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,23 @@ [package] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" edition = "2021" +[lints.rust] +warnings = "deny" +missing_docs = "deny" + +[lints.clippy] +all = { level = "deny", priority = -1 } +if_then_some_else_none = { level = "deny" } +mem_forget = { level = "deny" } +or_fun_call = "deny" +panic_in_result_fn = { level = "deny" } +same_name_method = { level = "deny" } +tests_outside_test_module = { level = "deny" } +unwrap_in_result = { level = "deny" } +use_debug = { level = "deny" } + [dev-dependencies] criterion = "0.5" lz4 = "1" diff --git a/benches/compress.rs b/benches/compress.rs index 10cc7ce..829b7e6 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,10 +1,16 @@ +//! Compression benchmark. +//! +//! Contains benchmarks for FSST compression, decompression, and symbol table training. +//! +//! Also contains LZ4 baseline. 
+#![allow(missing_docs)] use std::io::{Cursor, Read, Write}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use lz4::liblz4::BlockChecksum; use lz4::{BlockSize, ContentChecksum}; -use fsst_rs::{train, SymbolTable}; +use fsst_rs::{train, Code}; const CORPUS: &str = include_str!("dracula.txt"); const TEST: &str = "I found my smattering of German very useful here"; @@ -22,7 +28,7 @@ fn bench_fsst(c: &mut Criterion) { let compressed = table.compress(plaintext); let escape_count = compressed .iter() - .filter(|b| **b == SymbolTable::ESCAPE) + .filter(|b| **b == Code::ESCAPE_CODE) .count(); let ratio = (plaintext.len() as f64) / (compressed.len() as f64); println!( diff --git a/src/builder.rs b/src/builder.rs index c7ae814..1dca853 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -21,8 +21,8 @@ struct Counter { impl Counter { fn new() -> Self { Self { - counts1: vec![0; 512], - counts2: vec![vec![0; 512]; 512], + counts1: vec![0; 511], + counts2: vec![vec![0; 511]; 511], } } @@ -47,8 +47,20 @@ impl Counter { } } +/// The number of generations used for training. This is taken from the [FSST paper]. +/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf pub const MAX_GENERATIONS: usize = 5; +/// Build and train a `SymbolTable` from a sample corpus of text. +/// +/// This function implements the generational algorithm described in the [FSST paper] Section +/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts +/// to merge symbols when doing so would yield better compression than leaving them unmerged. The +/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape +/// code). +/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { let mut table = SymbolTable::default(); // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. 
@@ -87,7 +99,7 @@ impl SymbolTable { fn optimize(&self, counters: Counter) -> Self { let mut res = SymbolTable::default(); let mut pqueue = BinaryHeap::new(); - for code1 in 0..512 { + for code1 in 0..511 { let code1 = Code::from_u16(code1); let symbol1 = self.symbols[code1.0 as usize]; let gain = counters.count1(code1) * symbol1.len(); @@ -96,7 +108,7 @@ impl SymbolTable { gain, }); - for code2 in 0..512 { + for code2 in 0..511 { let code2 = Code::from_u16(code2); let symbol2 = &self.symbols[code2.0 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of @@ -130,6 +142,9 @@ impl SymbolTable { } } +/// A candidate for inclusion in a symbol table. +/// +/// This is really only useful for the `optimize` step of training. struct Candidate { gain: usize, symbol: Symbol, @@ -166,7 +181,7 @@ impl Ord for Candidate { #[cfg(test)] mod test { - use crate::{train, SymbolTable}; + use crate::{train, Code}; #[test] fn test_builder() { @@ -178,24 +193,24 @@ mod test { let compressed = table.compress(text.as_bytes()); // Ensure that the compressed string has no escape bytes - assert!(compressed.iter().all(|b| *b != SymbolTable::ESCAPE)); + assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE)); // Ensure that we can compress a string with no values seen at training time. let compressed = table.compress("xyz123".as_bytes()); assert_eq!( compressed, vec![ - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'x', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'y', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'z', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'1', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'2', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'3', ] ) diff --git a/src/lib.rs b/src/lib.rs index 7fff00d..67f1d3d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,13 @@ +//! A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. +//! +//! 
FSST is a string compression algorithm meant for use in database systems. It was designed by +//! [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression +//! and decompression of strings at compression rates competitive with or better than LZ4. +//! +//! NOTE: This current implementation is still in-progress, please use at your own risk. +//! +//! [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + use std::fmt::{Debug, Formatter}; pub use builder::*; @@ -5,8 +15,6 @@ pub use builder::*; mod builder; mod longest; -pub const ESCAPE: u8 = 0xFF; - /// A Symbol wraps a set of values of #[derive(Copy, Clone)] pub union Symbol { @@ -21,8 +29,10 @@ impl Debug for Symbol { } impl Symbol { + /// Zero value for `Symbol`. pub const ZERO: Self = Self::zero(); + /// Constructor for a `Symbol` from an 8-element byte slice. pub fn from_slice(slice: &[u8; 8]) -> Self { Self { bytes: *slice } } @@ -53,43 +63,26 @@ impl Symbol { size_of::() - null_bytes } + /// Returns true if the symbol does not encode any bytes. + /// + /// Note that this should only be true for the zero code. pub fn is_empty(&self) -> bool { self.len() == 0 } + /// View the first `len()` bytes of this symbol as a byte slice. pub fn as_slice(&self) -> &[u8] { let len = self.len(); - // Safety: the length from `len()` can never be more than 8. + // SAFETY: constructors will not allow building a struct where len > 8. 
unsafe { &self.bytes[0..len] } } - pub fn append_to(&self, vec: &mut Vec) { - match self.len() { - 0 => self.append_inner::<0>(vec), - 1 => self.append_inner::<1>(vec), - 2 => self.append_inner::<2>(vec), - 3 => self.append_inner::<3>(vec), - 4 => self.append_inner::<4>(vec), - 5 => self.append_inner::<5>(vec), - 6 => self.append_inner::<6>(vec), - 7 => self.append_inner::<7>(vec), - 8 => self.append_inner::<8>(vec), - _ => unreachable!("Symbol::len() always ≤ 8"), - } - } - - fn append_inner(&self, vec: &mut Vec) { - for i in 0..N { - let byte: u8 = unsafe { self.num >> i } as u8; - vec.push(byte); - } - } - /// Returns true if the symbol is a prefix of the provided text. pub fn is_prefix(&self, text: &[u8]) -> bool { text.starts_with(self.as_slice()) } + /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. pub fn concat(&self, other: &Self) -> Self { let new_len = self.len() + other.len(); assert!(new_len <= 8, "cannot build symbol with length > 8"); @@ -102,13 +95,31 @@ impl Symbol { } } -/// Codes correspond to bytes. +/// Codes used to map symbols to bytes. +/// +/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit +/// value packed into a `u16`. +/// +/// Physically in-memory, `Code(0)` through `Code(255)` correspond to escape sequences of raw bytes +/// 0 through 255. `Code(256)` through `Code(511)` represent the actual codes 0-255. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Code(u16); impl Code { + /// Maximum code value for the in-memory `Code` representation. pub const CODE_MAX: u16 = 512; + /// Maximum code value. Code 255 is reserved as the [escape code][`Self::ESCAPE_CODE`]. + pub const MAX_CODE: u8 = 254; + + /// Code used to indicate bytes that are not in the symbol table. + /// + /// When compressing a string that cannot fully be expressed with the symbol table, the compressed + /// output will contain an `ESCAPE` byte followed by a raw byte. 
At decompression time, the presence + /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of + /// being looked up in the symbol table. + pub const ESCAPE_CODE: u8 = 255; + /// Create a new code representing an escape byte. pub fn new_escaped(byte: u8) -> Self { Self(byte as u16) @@ -116,6 +127,12 @@ impl Code { /// Create a new code representing a symbol. pub fn new_symbol(code: u8) -> Self { + assert_ne!( + code, + Code::ESCAPE_CODE, + "code {code} cannot be used for symbol, reserved for ESCAPE" + ); + Self((code as u16) + 256) } @@ -136,10 +153,27 @@ impl Code { } } +/// The static symbol table used for compression and decompression. +/// +/// The `SymbolTable` is the central component of FSST. You can create a SymbolTable either by +/// default, or by [training] it on an input corpus of text. +/// +/// Example usage: +/// +/// ``` +/// use fsst_rs::{Symbol, SymbolTable}; +/// let mut table = SymbolTable::default(); +/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0])); +/// +/// let compressed = table.compress("hello".as_bytes()); +/// assert_eq!(compressed, vec![0u8]); +/// ``` +/// +/// [training]: train #[derive(Clone, Debug)] pub struct SymbolTable { /// Table mapping codes to symbols. - pub(crate) symbols: [Symbol; 512], + pub(crate) symbols: [Symbol; 511], /// Indicates the number of entries in the symbol table that have been populated. /// @@ -151,7 +185,7 @@ impl Default for SymbolTable { fn default() -> Self { let mut table = Self { - symbols: [Symbol::ZERO; 512], + symbols: [Symbol::ZERO; 511], n_symbols: 0, }; @@ -170,14 +204,15 @@ impl Default for SymbolTable { /// The symbol table is trained on a corpus of data in the form of a single byte array, building up /// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". impl SymbolTable { - pub const ESCAPE: u8 = 255; - /// Insert a new symbol at the end of the table. 
/// /// # Panics /// Panics if the table is already full. pub fn insert(&mut self, symbol: Symbol) { - assert!(self.n_symbols < 512, "cannot insert into full symbol table"); + assert!( + self.n_symbols < self.symbols.len(), + "cannot insert into full symbol table" + ); self.symbols[self.n_symbols] = symbol; self.n_symbols += 1; } @@ -193,7 +228,7 @@ impl SymbolTable { if next_code.is_escape() { // Case 1 -escape: push an ESCAPE followed by the next byte. // println!("ESCAPE"); - values.push(Self::ESCAPE); + values.push(Code::ESCAPE_CODE); values.push(next_code.0 as u8); pos += 1; } else { @@ -218,7 +253,7 @@ impl SymbolTable { while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { let code = compressed[in_pos]; - if code == SymbolTable::ESCAPE { + if code == Code::ESCAPE_CODE { // Advance by one, do raw write. in_pos += 1; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer diff --git a/src/longest.rs b/src/longest.rs index 50f0ff7..445a88a 100644 --- a/src/longest.rs +++ b/src/longest.rs @@ -11,10 +11,9 @@ impl SymbolTable { // Find the code that best maps to the provided text table here. let mut best_code = Code::new_escaped(text[0]); let mut best_overlap = 1; - for code in 0..512 { + for code in 0..511 { let symbol = &self.symbols[code as usize]; if symbol.is_prefix(text) && symbol.len() > best_overlap { - // println!("using ideal code: code={code} symbol{:?} len={}", symbol.as_slice(), symbol.len()); best_code = Code::from_u16(code); best_overlap = symbol.len(); }