From 31351ca0da1087f5b96362cdb800facfef319245 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 11:16:40 -0400 Subject: [PATCH] Get compress performance to match paper algorithm 4 (#3) This gets us close to 2-3 cycles per byte or so that they reference in the paper for predicated scalar compression. ![image](https://github.com/user-attachments/assets/5e0c6c24-cb71-435d-ae5c-51f291018f94) ^ the benchmark is compression on string with length 50, so compression is roughly 1-2ns per byte (roughly 3-5 cycles on my M2) --- .gitignore | 7 - Cargo.lock | 13 +- Cargo.toml | 15 +- benches/compress.rs | 36 +-- examples/file_compressor.rs | 70 ++++++ examples/round_trip.rs | 19 ++ rust-toolchain.toml | 3 +- src/builder.rs | 67 +++--- src/find_longest/mod.rs | 5 + src/find_longest/naive.rs | 28 +++ src/lib.rs | 385 ++++++++++++++++++++++++++------- src/longest.rs | 24 -- src/lossy_pht.rs | 118 ++++++++++ tests/correctness.rs | 59 +++++ tests/fixtures/declaration.txt | 63 ++++++ 15 files changed, 735 insertions(+), 177 deletions(-) create mode 100644 examples/file_compressor.rs create mode 100644 examples/round_trip.rs create mode 100644 src/find_longest/mod.rs create mode 100644 src/find_longest/naive.rs delete mode 100644 src/longest.rs create mode 100644 src/lossy_pht.rs create mode 100644 tests/correctness.rs create mode 100644 tests/fixtures/declaration.txt diff --git a/.gitignore b/.gitignore index 8b196e9..c403c34 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,2 @@ /target .idea/ - - -# Added by cargo -# -# already existing elements were commented out - -#/target diff --git a/Cargo.lock b/Cargo.lock index 48d9198..b5e4226 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,9 +43,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" +checksum = "5fb8dd288a69fc53a1996d7ecfbf4a20d59065bff137ce7e56bbd620de191189" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -438,6 +441,12 @@ dependencies = [ "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "syn" version = "2.0.74" diff --git a/Cargo.toml b/Cargo.toml index 030301c..31f9e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,10 @@ [package] name = "fsst-rs" version = "0.0.1" +description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" +authors = ["SpiralDB Developers "] +license = "Apache-2.0" +repository = "https://github.com/spiraldb/fsst" edition = "2021" [lints.rust] @@ -22,7 +26,16 @@ use_debug = { level = "deny" } criterion = "0.5" lz4 = "1" +[[example]] +name = "round_trip" +bench = false +test = false + [[bench]] name = "compress" harness = false -bench = true + +[[test]] +name = "correctness" +test = true +bench = false diff --git a/benches/compress.rs b/benches/compress.rs index 829b7e6..603eca1 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -4,13 +4,14 @@ //! //! Also contains LZ4 baseline. 
#![allow(missing_docs)] +use core::str; use std::io::{Cursor, Read, Write}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use lz4::liblz4::BlockChecksum; use lz4::{BlockSize, ContentChecksum}; -use fsst_rs::{train, Code}; +use fsst_rs::{train, ESCAPE_CODE}; const CORPUS: &str = include_str!("dracula.txt"); const TEST: &str = "I found my smattering of German very useful here"; @@ -26,17 +27,17 @@ fn bench_fsst(c: &mut Criterion) { let plaintext = TEST.as_bytes(); let compressed = table.compress(plaintext); - let escape_count = compressed - .iter() - .filter(|b| **b == Code::ESCAPE_CODE) - .count(); + let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count(); let ratio = (plaintext.len() as f64) / (compressed.len() as f64); println!( "Escapes = {escape_count}/{}, compression_ratio = {ratio}", compressed.len() ); - assert_eq!(table.decompress(&compressed), TEST.as_bytes()); + let decompressed = table.decompress(&compressed); + let decompressed = str::from_utf8(&decompressed).unwrap(); + println!("DECODED: {}", decompressed); + assert_eq!(decompressed, TEST); group.bench_function("compress-single", |b| { b.iter(|| black_box(table.compress(black_box(plaintext)))); @@ -50,29 +51,6 @@ fn bench_fsst(c: &mut Criterion) { fn bench_lz4(c: &mut Criterion) { let mut group = c.benchmark_group("lz4"); - // { - // let compressed = Vec::with_capacity(10_000); - // let mut encoder = lz4::EncoderBuilder::new() - // .block_size(BlockSize::Max64KB) - // .build(compressed) - // .unwrap(); - // - // encoder.write_all(TEST.as_bytes()).unwrap(); - // let (compressed, result) = encoder.finish(); - // result.unwrap(); - // - // let ratio = (TEST.as_bytes().len() as f64) / (compressed.len() as f64); - // println!("LZ4 compress_ratio = {ratio}"); - // - // // ensure decodes cleanly - // let cursor = Cursor::new(compressed); - // let mut decoder = lz4::Decoder::new(cursor).unwrap(); - // let mut output = String::new(); - // - // 
decoder.read_to_string(&mut output).unwrap(); - // assert_eq!(output.as_str(), TEST); - // } - group.bench_function("compress-single", |b| { let mut compressed = Vec::with_capacity(100_000_000); let mut encoder = lz4::EncoderBuilder::new() diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs new file mode 100644 index 0000000..3dab660 --- /dev/null +++ b/examples/file_compressor.rs @@ -0,0 +1,70 @@ +#![allow(missing_docs, clippy::use_debug)] + +//! This is a command line program that expects two input files as arguments. +//! +//! The first is the file to train a symbol table on. +//! +//! The second is the file to compress. The compressor will run and compress +//! in chunks of 16MB, logging the compression ratio for each chunk. +//! +//! Example: +//! +//! ``` +//! cargo run --release --example file_compressor -- file1.csv file2.csv +//! ``` +use std::{ + fs::File, + io::Read, + os::unix::fs::{FileExt, MetadataExt}, + path::Path, +}; + +fn main() { + let args: Vec<_> = std::env::args().skip(1).collect(); + assert!(args.len() >= 2, "args TRAINING and FILE must be provided"); + + let train_path = Path::new(&args[0]); + let input_path = Path::new(&args[1]); + + let mut train_bytes = Vec::new(); + { + let mut f = File::open(train_path).unwrap(); + f.read_to_end(&mut train_bytes).unwrap(); + } + + println!("building the compressor from {train_path:?}..."); + let compressor = fsst_rs::train(&train_bytes); + + println!("compressing blocks of {input_path:?} with compressor..."); + + let f = File::open(input_path).unwrap(); + let size_bytes = f.metadata().unwrap().size() as usize; + + const CHUNK_SIZE: usize = 16 * 1024 * 1024; + + let mut chunk_idx = 1; + let mut pos = 0; + let mut chunk = vec![0u8; CHUNK_SIZE]; + while pos + CHUNK_SIZE < size_bytes { + f.read_exact_at(&mut chunk, pos as u64).unwrap(); + // Compress the chunk, don't write it anywhere. 
+ let compact = compressor.compress(&chunk); + let compression_ratio = (CHUNK_SIZE as f64) / (compact.len() as f64); + println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); + + pos += CHUNK_SIZE; + chunk_idx += 1; + } + + // Read last chunk with a new custom-sized buffer. + if pos < size_bytes { + let amount = size_bytes - pos; + chunk = vec![0u8; size_bytes - pos]; + f.read_exact_at(&mut chunk, pos as u64).unwrap(); + // Compress the chunk, don't write it anywhere. + let compact = compressor.compress(&chunk[0..amount]); + let compression_ratio = (amount as f64) / (compact.len() as f64); + println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); + } + println!("done"); +} diff --git a/examples/round_trip.rs b/examples/round_trip.rs new file mode 100644 index 0000000..0f3fab7 --- /dev/null +++ b/examples/round_trip.rs @@ -0,0 +1,19 @@ +//! Simple example where we show round-tripping a string through the static symbol table. + +use core::str; + +fn main() { + // Train on a sample. 
+ let sample = "the quick brown fox jumped over the lazy dog"; + let trained = fsst_rs::train(sample.as_bytes()); + let compressed = trained.compress(sample.as_bytes()); + println!("compressed: {} => {}", sample.len(), compressed.len()); + // decompress now + let decode = trained.decompress(&compressed); + let output = str::from_utf8(&decode).unwrap(); + println!( + "decoded to the original: len={} text='{}'", + decode.len(), + output + ); +} diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 544af13..2296533 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,4 @@ [toolchain] -channel = "nightly-2024-06-19" +channel = "nightly-2024-08-14" components = ["rust-src", "rustfmt", "clippy"] profile = "minimal" - diff --git a/src/builder.rs b/src/builder.rs index 31933db..558a3b4 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -7,7 +7,8 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; -use crate::{Code, Symbol, SymbolTable}; +use crate::find_longest::FindLongestSymbol; +use crate::{Symbol, SymbolTable, MAX_CODE}; #[derive(Debug, Clone)] struct Counter { @@ -21,29 +22,29 @@ struct Counter { impl Counter { fn new() -> Self { Self { - counts1: vec![0; Code::CODE_MAX as usize], - counts2: vec![vec![0; Code::CODE_MAX as usize]; Code::CODE_MAX as usize], + counts1: vec![0; MAX_CODE as usize], + counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize], } } #[inline] - fn record_count1(&mut self, code1: Code) { - self.counts1[code1.0 as usize] += 1; + fn record_count1(&mut self, code1: u16) { + self.counts1[code1 as usize] += 1; } #[inline] - fn record_count2(&mut self, code1: Code, code2: Code) { - self.counts2[code1.0 as usize][code2.0 as usize] += 1; + fn record_count2(&mut self, code1: u16, code2: u16) { + self.counts2[code1 as usize][code2 as usize] += 1; } #[inline] - fn count1(&self, code: Code) -> usize { - self.counts1[code.0 as usize] + fn count1(&self, code: u16) -> usize { + self.counts1[code as usize] } #[inline] - fn 
count2(&self, code1: Code, code2: Code) -> usize { - self.counts2[code1.0 as usize][code2.0 as usize] + fn count2(&self, code1: u16, code2: u16) -> usize { + self.counts2[code1 as usize][code2 as usize] } } @@ -65,6 +66,9 @@ pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { let mut table = SymbolTable::default(); // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. let sample = corpus.as_ref(); + if sample.is_empty() { + return table; + } for _generation in 0..MAX_GENERATIONS { let counter = table.compress_count(sample); table = table.optimize(counter); @@ -81,13 +85,13 @@ impl SymbolTable { let len = sample.len(); let mut prev_code = self.find_longest_symbol(sample); counter.record_count1(prev_code); - let mut pos = self.symbols[prev_code.0 as usize].len(); + let mut pos = self.symbols[prev_code as usize].len(); while pos < len { let code = self.find_longest_symbol(&sample[pos..len]); counter.record_count1(code); counter.record_count2(prev_code, code); - pos += self.symbols[code.0 as usize].len(); + pos += self.symbols[code as usize].len(); prev_code = code; } @@ -100,8 +104,7 @@ impl SymbolTable { let mut res = SymbolTable::default(); let mut pqueue = BinaryHeap::new(); for code1 in 0..511 { - let code1 = Code::from_u16(code1); - let symbol1 = self.symbols[code1.0 as usize]; + let symbol1 = self.symbols[code1 as usize]; let gain = counters.count1(code1) * symbol1.len(); pqueue.push(Candidate { symbol: symbol1, @@ -109,8 +112,7 @@ impl SymbolTable { }); for code2 in 0..511 { - let code2 = Code::from_u16(code2); - let symbol2 = &self.symbols[code2.0 as usize]; + let symbol2 = &self.symbols[code2 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of // length greater than 8, skip. if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { @@ -133,10 +135,13 @@ impl SymbolTable { } // Pop the 255 best symbols. 
- pqueue - .iter() - .take(255) - .for_each(|candidate| res.insert(candidate.symbol)); + let mut n_symbols = 0; + while !pqueue.is_empty() && n_symbols < 255 { + let candidate = pqueue.pop().unwrap(); + if res.insert(candidate.symbol) { + n_symbols += 1; + } + } res } @@ -181,7 +186,7 @@ impl Ord for Candidate { #[cfg(test)] mod test { - use crate::{train, Code}; + use crate::{train, ESCAPE_CODE}; #[test] fn test_builder() { @@ -193,26 +198,26 @@ mod test { let compressed = table.compress(text.as_bytes()); // Ensure that the compressed string has no escape bytes - assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE)); + assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); - // Ensure that we can compress a string with no values seen at training time. + // Ensure that we can compress a string with no values seen at training time, with escape bytes let compressed = table.compress("xyz123".as_bytes()); assert_eq!( compressed, vec![ - Code::ESCAPE_CODE, + ESCAPE_CODE, b'x', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'y', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'z', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'1', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'2', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'3', ] - ) + ); } } diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs new file mode 100644 index 0000000..00eb7b2 --- /dev/null +++ b/src/find_longest/mod.rs @@ -0,0 +1,5 @@ +mod naive; + +pub trait FindLongestSymbol { + fn find_longest_symbol(&self, text: &[u8]) -> u16; +} diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs new file mode 100644 index 0000000..c75ecad --- /dev/null +++ b/src/find_longest/naive.rs @@ -0,0 +1,28 @@ +use crate::find_longest::FindLongestSymbol; +use crate::SymbolTable; + +// Find the code that maps to a symbol with longest-match to a piece of text. +// +// This is the naive algorithm that just scans the whole table and is very slow. 
+ +impl FindLongestSymbol for SymbolTable { + // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. + #[inline(never)] + fn find_longest_symbol(&self, text: &[u8]) -> u16 { + debug_assert!(!text.is_empty(), "text must not be empty"); + + // Find the code that best maps to the provided text table here. + // Start with the code corresponding to the escape of the first character in the text + let mut best_code = text[0] as u16; + let mut best_overlap = 1; + for code in 256..511 { + let symbol = &self.symbols[code as usize]; + if symbol.is_prefix(text) && symbol.len() > best_overlap { + best_code = code; + best_overlap = symbol.len(); + } + } + + best_code + } +} diff --git a/src/lib.rs b/src/lib.rs index 068f610..7191b00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,23 +1,30 @@ #![doc = include_str!("../README.md")] + +/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. +macro_rules! assert_sizeof { + ($typ:ty => $size_in_bytes:expr) => { + const _: [u8; $size_in_bytes] = [0; std::mem::size_of::<$typ>()]; + }; +} + use std::fmt::{Debug, Formatter}; pub use builder::*; +use lossy_pht::LossyPHT; mod builder; -mod longest; +mod find_longest; +mod lossy_pht; -/// A Symbol wraps a set of values of +/// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`SymbolTable`] and +/// identified by an 8-bit code. #[derive(Copy, Clone)] pub union Symbol { bytes: [u8; 8], num: u64, } -impl Debug for Symbol { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", unsafe { self.num }) - } -} +assert_sizeof!(Symbol => 8); impl Symbol { /// Zero value for `Symbol`. @@ -51,7 +58,14 @@ impl Symbol { // For little-endian platforms, this counts the number of *trailing* zeros let null_bytes = (numeric.leading_zeros() >> 3) as usize; - size_of::() - null_bytes + // Special case handling of a symbol with all-zeros. This is actually + // a 1-byte symbol containing 0x00. 
+ let len = size_of::() - null_bytes; + if len == 0 { + 1 + } else { + len + } } /// Returns true if the symbol does not encode any bytes. @@ -61,7 +75,31 @@ impl Symbol { self.len() == 0 } - /// Create a ew + #[inline] + fn as_u64(&self) -> u64 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num } + } + + /// Get the first byte of the symbol as a `u8`. + /// + /// If the symbol is empty, this will return the zero byte. + #[inline] + pub fn first_byte(&self) -> u8 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num as u8 } + } + + /// Get the first two bytes of the symbol as a `u16`. + /// + /// If the Symbol is shorter than two bytes, the missing high-order byte(s) of the result are zero. + #[inline] + pub fn first_two_bytes(&self) -> u16 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num as u16 } + } + + /// Access the Symbol as a slice. pub fn as_slice(&self) -> &[u8] { let len = self.len(); // SAFETY: constructors will not allow building a struct where len > 8. @@ -80,66 +118,88 @@ impl Symbol { let self_len = self.len(); let mut result = *self; + + // SAFETY: self_len and new_len are checked to be <= 8 unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) }; result } } -/// Codes used to map symbols to bytes. +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", unsafe { self.bytes }) + } +} + +/// Code and associated metadata for a symbol. +/// +/// Logically, codes can range from 0-255 inclusive. This type holds both the 8-bit code as well as +/// other metadata bit-packed into a `u16`. /// -/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit -/// value packed into a `u16`. +/// The bottom 8 bits contain EITHER a code for a symbol stored in the table, OR a raw byte. /// -/// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes -/// 0 through 255.
`Code(256)` through `Code(511)` represent the actual codes -255. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct Code(u16); +/// The interpretation depends on the 9th bit: when toggled off, the value stores a raw byte, and when +/// toggled on, it stores a code. Thus if you examine the bottom 9 bits of the `u16`, you have an extended +/// code range, where the values 0-255 are raw bytes, and the values 256-510 represent codes 0-254. 511 is +/// a placeholder for the invalid code here. +/// +/// Bits 12-15 store the length of the symbol (values ranging from 0-8). +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct CodeMeta(u16); -impl Code { - /// Maximum code value for the in-memory `Code` representation. - /// - /// When truncated to u8 this is code 255, which is equivalent to [`Self::ESCAPE_CODE`]. - pub const CODE_MAX: u16 = 511; +/// Code used to indicate bytes that are not in the symbol table. +/// +/// When compressing a string that cannot fully be expressed with the symbol table, the compressed +/// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence +/// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of +/// being looked up in the symbol table. +pub const ESCAPE_CODE: u8 = 255; - /// Code used to indicate bytes that are not in the symbol table. - /// - /// When compressing a string that cannot fully be expressed with the symbol table, the compressed - /// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence - /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of - /// being looked up in the symbol table. - pub const ESCAPE_CODE: u8 = 255; - - /// Create a new code representing an escape byte. - pub fn new_escaped(byte: u8) -> Self { - Self(byte as u16) +/// Maximum value for the extended code range. 
+/// +/// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. +pub const MAX_CODE: u16 = 511; + +#[allow(clippy::len_without_is_empty)] +impl CodeMeta { + const EMPTY: Self = CodeMeta(MAX_CODE); + + fn new(code: u8, escape: bool, len: u16) -> Self { + let value = (len << 12) | ((escape as u16) << 8) | (code as u16); + Self(value) } - /// Create a new code representing a symbol. - pub fn new_symbol(code: u8) -> Self { - assert_ne!( - code, - Code::ESCAPE_CODE, - "code {code} cannot be used for symbol, reserved for ESCAPE" - ); + /// Create a new code from a [`Symbol`]. + fn new_symbol(code: u8, symbol: Symbol) -> Self { + assert_ne!(code, ESCAPE_CODE, "ESCAPE_CODE cannot be used for symbol"); - Self((code as u16) + 256) + Self::new(code, false, symbol.len() as u16) } - /// Create a `Code` directly from a `u16` value. - /// - /// # Panics - /// Panic if the value is ≥ the defined `CODE_MAX`. - pub fn from_u16(code: u16) -> Self { - assert!(code < Self::CODE_MAX, "code value higher than CODE_MAX"); + #[inline] + fn code(&self) -> u8 { + self.0 as u8 + } - Self(code) + #[inline] + fn extended_code(&self) -> u16 { + self.0 & 0b111_111_111 } - /// Returns true if the code is for an escape byte. 
#[inline] - pub fn is_escape(&self) -> bool { - self.0 <= 255 + fn len(&self) -> u16 { + self.0 >> 12 + } +} + +impl Debug for CodeMeta { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CodeMeta") + .field("code", &(self.0 as u8)) + .field("is_escape", &(self.0 < 256)) + .field("len", &(self.0 >> 12)) + .finish() } } @@ -153,23 +213,32 @@ impl Code { /// ``` /// use fsst_rs::{Symbol, SymbolTable}; /// let mut table = SymbolTable::default(); -/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0])); +/// +/// // Insert a new symbol +/// assert!(table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]))); /// /// let compressed = table.compress("hello".as_bytes()); /// assert_eq!(compressed, vec![0u8]); /// ``` /// /// [training]: [`train`] -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct SymbolTable { /// Table mapping codes to symbols. pub(crate) symbols: [Symbol; 511], - /// Indicates the number of entries in the symbol table that have been populated. - /// - /// This value is always at least 256, as the first 256 entries in the `table` are the escape - /// bytes. - pub(crate) n_symbols: usize, + /// Indicates the number of entries in the symbol table that have been populated, not counting + /// the escape values. + pub(crate) n_symbols: u8, + + // + // Index structures used to speedup building the symbol table and compression + // + /// Inverted index mapping 2-byte symbols to codes + codes_twobyte: Vec, + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, } impl Default for SymbolTable { @@ -177,13 +246,14 @@ impl Default for SymbolTable { let mut table = Self { symbols: [Symbol::ZERO; 511], n_symbols: 0, + codes_twobyte: vec![CodeMeta::EMPTY; 65_536], + lossy_pht: LossyPHT::new(), }; // Populate the escape byte entries. 
for byte in 0..=255 { table.symbols[byte as usize] = Symbol::from_u8(byte); } - table.n_symbols = 256; table } @@ -194,39 +264,158 @@ impl Default for SymbolTable { /// The symbol table is trained on a corpus of data in the form of a single byte array, building up /// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". impl SymbolTable { - /// Insert a new symbol at the end of the table. + /// Attempt to insert a new symbol at the end of the table. /// /// # Panics /// Panics if the table is already full. - pub fn insert(&mut self, symbol: Symbol) { - assert!( - self.n_symbols < self.symbols.len(), - "cannot insert into full symbol table" - ); - self.symbols[self.n_symbols] = symbol; + pub fn insert(&mut self, symbol: Symbol) -> bool { + assert!(self.n_symbols < 255, "cannot insert into full symbol table"); + + let symbol_len = symbol.len(); + if symbol_len <= 2 { + // Insert the 2-byte symbol into the twobyte cache + self.codes_twobyte[symbol.first_two_bytes() as usize] = + CodeMeta::new_symbol(self.n_symbols, symbol); + } else if symbol_len >= 3 { + // Attempt to insert larger symbols into the 3-byte cache + if !self.lossy_pht.insert(symbol, self.n_symbols) { + return false; + } + } + + // Insert at the end of the symbols table. + // Note the rescaling from range [0-254] -> [256, 510]. + self.symbols[256 + (self.n_symbols as usize)] = symbol; self.n_symbols += 1; + true + } + + /// Using the symbol table, runs a single cycle of compression from the front of `in_ptr`, writing + /// the output into `out_ptr`. Attempts to process an entire 64-bit word of prefix from `in_ptr`. + /// + /// # Returns + /// + /// This function returns a tuple of (advance_in, advance_out). + /// + /// The emitted code is written to `out_ptr` and is not part of the returned tuple. + /// + /// `advance_in` is the number of bytes to advance `in_ptr` before the next call. + /// + /// `advance_out` is the number of bytes to advance `out_ptr` before the next call.
+ /// + /// # Safety + /// + /// `out_ptr` must never be NULL or otherwise point to invalid memory. + // NOTE(aduffy): uncomment this line to make the function appear in profiles + #[inline(never)] + pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { + // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and + // if it isn't, it will be overwritten anyway. + // + // SAFETY: caller ensures out_ptr is not null + let first_byte = word as u8; + unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; + + // Probe the hash table + let entry = self.lossy_pht.lookup(word); + + // Now, downshift the `word` and the `entry` to see if they align. + let ignored_bits = entry.ignored_bits; + + if !compare_masked(word, entry.symbol.as_u64(), ignored_bits) || entry.is_unused() { + // lookup the appropriate code for the twobyte sequence and write it + // This will hold either 511, OR it will hold the actual code. + let code = self.codes_twobyte[(word as u16) as usize]; + let out = code.code(); + unsafe { + out_ptr.write(out); + } + + // Advance the input by one byte and the output by 1 byte (if real code) or 2 bytes (if escape). + return ( + if out == ESCAPE_CODE { + 1 + } else { + code.len() as usize + }, + if out == ESCAPE_CODE { 2 } else { 1 }, + ); + } + + let code = entry.code; + unsafe { + out_ptr.write_unaligned(code.code()); + } + + (code.len() as usize, 1) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. pub fn compress(&self, plaintext: &[u8]) -> Vec { - let mut values = Vec::with_capacity(2 * plaintext.len()); - let len = plaintext.len(); - let mut pos = 0; - while pos < len { - let next_code = self.find_longest_symbol(&plaintext[pos..len]); - if next_code.is_escape() { - // Case 1 -escape: push an ESCAPE followed by the next byte. 
- values.push(Code::ESCAPE_CODE); - values.push(next_code.0 as u8); - pos += 1; - } else { - // Case 2 - code: push the code, increment position by symbol length - let symbol = self.symbols[next_code.0 as usize]; - values.push(next_code.0 as u8); - pos += symbol.len(); + if plaintext.is_empty() { + return Vec::new(); + } + + let mut values: Vec = Vec::with_capacity(2 * plaintext.len()); + + let mut in_ptr = plaintext.as_ptr(); + let mut out_ptr = values.as_mut_ptr(); + + // SAFETY: `end` will point just after the end of the `plaintext` slice. + let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; + let in_end_sub8 = unsafe { in_end.byte_sub(8) }; + // SAFETY: `end` will point just after the end of the `values` allocation. + let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; + + while in_ptr < in_end_sub8 && out_ptr < out_end { + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. + let word: u64 = (in_ptr as *const u64).read_unaligned(); + let (advance_in, advance_out) = self.compress_word(word, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + }; + } + + let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; + assert!( + remaining_bytes.is_positive(), + "in_ptr exceeded in_end, should not be possible" + ); + + // Shift off the remaining bytes + let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; + last_word = mask_prefix(last_word, remaining_bytes as usize); + + while in_ptr < in_end && out_ptr < out_end { + unsafe { + // `last_word` was loaded and masked above, so nothing is read from in_ptr in this loop. + // NOTE(review): that 8-byte load can overrun `in_end`; confirm it is sound.
+ let (advance_in, advance_out) = self.compress_word(last_word, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + + last_word = advance_8byte_word(last_word, advance_in); } } + // in_ptr should have exceeded in_end + assert!(in_ptr >= in_end, "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()"); + + // Count the number of bytes written + // SAFETY: assertion + unsafe { + let bytes_written = out_ptr.offset_from(values.as_ptr()); + assert!( + bytes_written.is_positive(), + "out_ptr ended before it started, not possible" + ); + + values.set_len(bytes_written as usize); + } + values } @@ -240,7 +429,7 @@ impl SymbolTable { while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { let code = compressed[in_pos]; - if code == Code::ESCAPE_CODE { + if code == ESCAPE_CODE { // Advance by one, do raw write. in_pos += 1; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer @@ -274,3 +463,37 @@ impl SymbolTable { decoded } } + +/// Mask the word, keeping only the `prefix_bytes` front. +fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 { + let mask = if prefix_bytes == 0 { + 0 + } else { + u64::MAX >> (8 * (8 - prefix_bytes)) + }; + + word & mask +} + +fn advance_8byte_word(word: u64, bytes: usize) -> u64 { + // shift the word off the right-end, because little endian means the first + // char is stored in the LSB. + // + // Note that even though this looks like it branches, Rust compiles this to a + // conditional move instruction. 
See `` + if bytes == 8 { + 0 + } else { + word >> (8 * bytes) + } +} + +fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { + let mask = if ignored_bits == 64 { + 0 + } else { + u64::MAX >> ignored_bits + }; + + (left & mask) == right +} diff --git a/src/longest.rs b/src/longest.rs deleted file mode 100644 index 445a88a..0000000 --- a/src/longest.rs +++ /dev/null @@ -1,24 +0,0 @@ -use crate::{Code, SymbolTable}; - -/// Find the longest substring. - -impl SymbolTable { - // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. - #[inline(never)] - pub(crate) fn find_longest_symbol(&self, text: &[u8]) -> Code { - debug_assert!(!text.is_empty(), "text must not be empty"); - - // Find the code that best maps to the provided text table here. - let mut best_code = Code::new_escaped(text[0]); - let mut best_overlap = 1; - for code in 0..511 { - let symbol = &self.symbols[code as usize]; - if symbol.is_prefix(text) && symbol.len() > best_overlap { - best_code = Code::from_u16(code); - best_overlap = symbol.len(); - } - } - - best_code - } -} diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs new file mode 100644 index 0000000..db4bcf5 --- /dev/null +++ b/src/lossy_pht.rs @@ -0,0 +1,118 @@ +use std::fmt::Debug; + +use crate::CodeMeta; +use crate::Symbol; +use crate::MAX_CODE; + +/// Size of the perfect hash table. +/// +/// NOTE: this differs from the paper, which recommends a 64KB total +/// table size. The paper does not account for the fact that most +/// vendors split the L1 cache into 32KB of instruction and 32KB of data. +pub const HASH_TABLE_SIZE: usize = 1 << 11; + +/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`]. +/// +/// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See +/// the module documentation for a link to the paper. +#[derive(Copy, Clone, Debug)] +#[repr(C)] +pub(crate) struct TableEntry { + /// Symbol, piece of a string, 8 bytes or fewer. 
+ pub(crate) symbol: Symbol, + + /// Code and associated metadata for the symbol + pub(crate) code: CodeMeta, + + /// Number of ignored bits in `symbol`. + /// + /// This is equivalent to `64 - 8 * code.len()` but is pre-computed to save a few instructions in + /// the compression loop. + pub(crate) ignored_bits: u16, +} + +assert_sizeof!(TableEntry => 16); + +impl TableEntry { + pub(crate) fn is_unused(&self) -> bool { + // 511 should never come up for real, so use as the sentinel for an unused slot + self.code.extended_code() == MAX_CODE + } +} + +/// Lossy Perfect Hash Table implementation for compression. +/// +/// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. +/// +/// It is so-called because the `insert` operation for a symbol may fail, if another symbol is +/// already occupying the slot. +/// +/// If insertions are made from highest-gain to lowest and from longest-symbol to shortest, then +/// we can say that any failed insert is not a big loss, because its slot is being held by a higher-gain +/// symbol. Note that because other code in this crate calls `insert` in the pop-order of a max heap, +/// this holds. +#[derive(Clone, Debug)] +pub(crate) struct LossyPHT { + /// Hash table slots. Used for strings that are 3 bytes or more. + slots: Vec, +} + +impl LossyPHT { + /// Construct a new empty lossy perfect hash table + pub(crate) fn new() -> Self { + let mut slots = Vec::with_capacity(HASH_TABLE_SIZE); + // Initialize all slots to empty entries + for _ in 0..HASH_TABLE_SIZE { + slots.push(TableEntry { + symbol: Symbol::ZERO, + code: CodeMeta::EMPTY, + ignored_bits: 64, + }); + } + + Self { slots } + } + + /// Try and insert the (symbol, code) pair into the table. + /// + /// If there is a collision, we keep the current thing and reject the write. + /// + /// # Returns + /// + /// True if the symbol was inserted into the table, false if it was rejected due to collision. 
+ pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool { + let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; + let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let entry = &mut self.slots[slot]; + + if !entry.is_unused() { + false + } else { + entry.symbol = symbol; + entry.code = CodeMeta::new_symbol(code, symbol); + entry.ignored_bits = (64 - 8 * symbol.len()) as u16; + true + } + } + + pub(crate) fn lookup(&self, word: u64) -> TableEntry { + let prefix_3bytes = word & 0xFF_FF_FF; + let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + + self.slots[slot] + } + + /// Hash a value to find the bucket it belongs in. + /// + /// The particular hash function comes from the code listing of Algorithm 4 of the FSST paper. + #[inline] + fn hash(&self, value: u64) -> u64 { + (value * 2971215073) ^ (value >> 15) + } +} + +impl Default for LossyPHT { + fn default() -> Self { + Self::new() + } +} diff --git a/tests/correctness.rs b/tests/correctness.rs new file mode 100644 index 0000000..8773bc7 --- /dev/null +++ b/tests/correctness.rs @@ -0,0 +1,59 @@ +#![cfg(test)] + +static PREAMBLE: &str = r#" +When in the Course of human events, it becomes necessary for one people to dissolve +the political bands which have connected them with another, and to assume among the +powers of the earth, the separate and equal station to which the Laws of Nature and +of Nature's God entitle them, a decent respect to the opinions of mankind requires +that they should declare the causes which impel them to the separation."#; + +static DECLARATION: &str = include_str!("./fixtures/declaration.txt"); + +#[test] +fn test_basic() { + // Roundtrip the declaration + let trained = fsst_rs::train(PREAMBLE); + let compressed = trained.compress(PREAMBLE.as_bytes()); + let decompressed = trained.decompress(&compressed); + assert_eq!(decompressed, PREAMBLE.as_bytes()); +} + +#[test] +fn test_train_on_empty() { + let trained = fsst_rs::train(""); + // We can 
still compress with it, but the symbols are going to be empty. + let compressed = trained.compress("the quick brown fox jumped over the lazy dog".as_bytes()); + assert_eq!( + trained.decompress(&compressed), + "the quick brown fox jumped over the lazy dog".as_bytes() + ); +} + +#[test] +fn test_zeros() { + println!("training zeros"); + let training_data: Vec = vec![0, 1, 2, 3, 4]; + let trained = fsst_rs::train(&training_data); + println!("compressing with zeros"); + let compressed = trained.compress(&[0, 4]); + println!("decomperssing with zeros"); + assert_eq!(trained.decompress(&compressed), &[0, 4]); + println!("done"); +} + +#[test] +fn test_large() { + let mut corpus = String::new(); + // TODO(aduffy): make this larger once table build performance is better. + while corpus.len() < 10 * 1_024 { + corpus.push_str(DECLARATION); + } + + let trained = fsst_rs::train(&corpus); + let mut massive = String::new(); + while massive.len() < 16 * 1_024 * 1_024 { + massive.push_str(DECLARATION); + } + let compressed = trained.compress(massive.as_bytes()); + assert_eq!(trained.decompress(&compressed), massive.as_bytes()); +} diff --git a/tests/fixtures/declaration.txt b/tests/fixtures/declaration.txt new file mode 100644 index 0000000..30ed22d --- /dev/null +++ b/tests/fixtures/declaration.txt @@ -0,0 +1,63 @@ +The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. 
+ +We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world. + +He has refused his Assent to Laws, the most wholesome and necessary for the public good. + +He has forbidden his Governors to pass Laws of immediate and pressing importance, unless suspended in their operation till his Assent should be obtained; and when so suspended, he has utterly neglected to attend to them. 
+ +He has refused to pass other Laws for the accommodation of large districts of people, unless those people would relinquish the right of Representation in the Legislature, a right inestimable to them and formidable to tyrants only. + +He has called together legislative bodies at places unusual, uncomfortable, and distant from the depository of their public Records, for the sole purpose of fatiguing them into compliance with his measures. + +He has dissolved Representative Houses repeatedly, for opposing with manly firmness his invasions on the rights of the people. + +He has refused for a long time, after such dissolutions, to cause others to be elected; whereby the Legislative powers, incapable of Annihilation, have returned to the People at large for their exercise; the State remaining in the mean time exposed to all the dangers of invasion from without, and convulsions within. + +He has endeavoured to prevent the population of these States; for that purpose obstructing the Laws for Naturalization of Foreigners; refusing to pass others to encourage their migrations hither, and raising the conditions of new Appropriations of Lands. + +He has obstructed the Administration of Justice, by refusing his Assent to Laws for establishing Judiciary powers. + +He has made Judges dependent on his Will alone, for the tenure of their offices, and the amount and payment of their salaries. + +He has erected a multitude of New Offices, and sent hither swarms of Officers to harrass our people, and eat out their substance. + +He has kept among us, in times of peace, Standing Armies without the Consent of our legislatures. + +He has affected to render the Military independent of and superior to the Civil power. 
+ +He has combined with others to subject us to a jurisdiction foreign to our constitution, and unacknowledged by our laws; giving his Assent to their Acts of pretended Legislation: + +For Quartering large bodies of armed troops among us: + +For protecting them, by a mock Trial, from punishment for any Murders which they should commit on the Inhabitants of these States: + +For cutting off our Trade with all parts of the world: + +For imposing Taxes on us without our Consent: + +For depriving us in many cases, of the benefits of Trial by Jury: + +For transporting us beyond Seas to be tried for pretended offences + +For abolishing the free System of English Laws in a neighbouring Province, establishing therein an Arbitrary government, and enlarging its Boundaries so as to render it at once an example and fit instrument for introducing the same absolute rule into these Colonies: + +For taking away our Charters, abolishing our most valuable Laws, and altering fundamentally the Forms of our Governments: + +For suspending our own Legislatures, and declaring themselves invested with power to legislate for us in all cases whatsoever. + +He has abdicated Government here, by declaring us out of his Protection and waging War against us. + +He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed the lives of our people. + +He is at this time transporting large Armies of foreign Mercenaries to compleat the works of death, desolation and tyranny, already begun with circumstances of Cruelty & perfidy scarcely paralleled in the most barbarous ages, and totally unworthy the Head of a civilized nation. + +He has constrained our fellow Citizens taken Captive on the high Seas to bear Arms against their Country, to become the executioners of their friends and Brethren, or to fall themselves by their Hands. 
+ +He has excited domestic insurrections amongst us, and has endeavoured to bring on the inhabitants of our frontiers, the merciless Indian Savages, whose known rule of warfare, is an undistinguished destruction of all ages, sexes and conditions. + +In every stage of these Oppressions We have Petitioned for Redress in the most humble terms: Our repeated Petitions have been answered only by repeated injury. A Prince whose character is thus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people. + +Nor have We been wanting in attentions to our Brittish brethren. We have warned them from time to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. We have reminded them of the circumstances of our emigration and settlement here. We have appealed to their native justice and magnanimity, and we have conjured them by the ties of our common kindred to disavow these usurpations, which, would inevitably interrupt our connections and correspondence. They too have been deaf to the voice of justice and of consanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation, and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends. 
+ +We, therefore, the Representatives of the united States of America, in General Congress, Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do, in the Name, and by Authority of the good People of these Colonies, solemnly publish and declare, That these United Colonies are, and of Right ought to be Free and Independent States; that they are Absolved from all Allegiance to the British Crown, and that all political connection between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free and Independent States, they have full Power to levy War, conclude Peace, contract Alliances, establish Commerce, and to do all other Acts and Things which Independent States may of right do. And for the support of this Declaration, with a firm reliance on the protection of divine Providence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor.