From 67ff87945861c5224bd145d0e9c6a270c6b85b18 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 8 Aug 2024 12:08:21 -0400 Subject: [PATCH] initial impl --- .gitignore | 8 ++ Cargo.lock | 7 ++ Cargo.toml | 6 + src/fsst.rs | 325 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 12 ++ 5 files changed, 358 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/fsst.rs create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a5ff07f --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +/target + + +# Added by cargo +# +# already existing elements were commented out + +#/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..414f6c6 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "fsst-rs" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..c90cd98 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "fsst-rs" +version = "0.1.0" +edition = "2021" + +[dependencies] diff --git a/src/fsst.rs b/src/fsst.rs new file mode 100644 index 0000000..960db9d --- /dev/null +++ b/src/fsst.rs @@ -0,0 +1,325 @@ +use std::cmp::min; + +const FSST_CODE_MAX: u16 = 256; +const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; +const FSST_LEN_BITS: u32 = 12; +const FSST_CODE_BITS: u32 = 9; +const FSST_CODE_BASE: u16 = 256; +const FSST_HASH_LOG2SIZE: usize = 10; +const FSST_HASH_PRIME: u64 = 2971215073; +const FSST_SHIFT: u32 = 15; +const FSST_ICL_FREE: u64 = (15 << 28) | ((FSST_CODE_MASK as u64) << 16); +const FSST_MAXHEADER: usize = 8 + 1 + 8 + 2048 + 1; +const FSST_ESC: u8 = 255; + +#[inline(always)] +fn fsst_unaligned_load(v: &[u8]) -> u64 { + let mut ret: u64 = 0; + unsafe { + std::ptr::copy_nonoverlapping(v.as_ptr(), &mut ret as *mut u64 as *mut u8, 8); + } + ret +} + +#[inline(always)] +fn fsst_hash(w: u64) -> u64 { + ((w * FSST_HASH_PRIME) ^ ((w * FSST_HASH_PRIME) >> FSST_SHIFT)) +} + +#[derive(Clone, Copy)] +struct Symbol { + val: [u8; 8], + icl: u64, +} + +impl Symbol { + const MAX_LENGTH: usize = 8; + + fn new() -> Self { + Symbol { val: [0; 8], icl: 0 } + } + + fn from_byte(c: u8, code: u16) -> Self { + let mut s = Symbol::new(); + s.val[0] = c; + s.set_code_len(code, 1); + s + } + + fn from_slice(input: &[u8]) -> Self { + let mut s = Symbol::new(); + let len = min(input.len(), Self::MAX_LENGTH); + s.val[..len].copy_from_slice(&input[..len]); + s.set_code_len(FSST_CODE_MASK, len as u32); + s + } + + fn set_code_len(&mut self, code: u16, len: u32) { + self.icl = (len << 28) as u64 | (code as u64) << 16 | ((8 - len) * 8) as u64; + } + + fn length(&self) -> u32 { + (self.icl >> 28) as u32 + } + + fn code(&self) -> u16 { + ((self.icl >> 16) & FSST_CODE_MASK as u64) as u16 + } + + fn ignored_bits(&self) -> u32 { + self.icl as u32 + } + + fn first(&self) -> u8 { + self.val[0] + } + + fn first2(&self) -> u16 { + u16::from_le_bytes([self.val[0], self.val[1]]) + } + + fn hash(&self) -> usize { + let v = u32::from_le_bytes([self.val[0], self.val[1], self.val[2], self.val[3]]); + fsst_hash(v as u64) as usize + } +} + +struct SymbolTable { + short_codes: [u16; 65536], + byte_codes: [u16; 256], + symbols: Vec, + hash_tab: Vec, + n_symbols: u16, + suffix_lim: u16, + terminator: u16, + zero_terminated: bool, + len_histo: [u16; FSST_CODE_BITS as usize], +} + +impl SymbolTable { + fn new() -> Self { + let mut st = SymbolTable { + short_codes: [0; 65536], + byte_codes: [0; 256], + symbols: vec![Symbol::new(); FSST_CODE_MAX as usize], + hash_tab: vec![Symbol::new(); 1 << FSST_HASH_LOG2SIZE], + n_symbols: 0, + suffix_lim: FSST_CODE_MAX, + terminator: 0, + zero_terminated: false, + len_histo: [0; FSST_CODE_BITS as usize], + }; + + for i in 0..256 { + st.symbols[i] = Symbol::from_byte(i as u8, i as u16 | (1 << FSST_LEN_BITS)); + } + + for i in 256..FSST_CODE_MAX as usize { + st.symbols[i] = Symbol::from_byte(0, FSST_CODE_MASK); + } + + for i in 0..256 { + st.byte_codes[i] = (1 << FSST_LEN_BITS) | i as u16; + } + + for i in 0..65536 { + st.short_codes[i] = (1 << FSST_LEN_BITS) | (i & 255) as u16; + } + + st + } + + fn clear(&mut self) { + self.len_histo = [0; FSST_CODE_BITS as usize]; + for i in FSST_CODE_BASE as usize..FSST_CODE_BASE as usize + self.n_symbols as usize { + let symbol = &self.symbols[i]; + if symbol.length() == 1 { + let val = symbol.first(); + self.byte_codes[val as usize] = (1 << FSST_LEN_BITS) | val as u16; + } else if symbol.length() == 2 { + let val = symbol.first2(); + self.short_codes[val as usize] = (1 << FSST_LEN_BITS) | (val & 255); + } else { + let idx = symbol.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); + self.hash_tab[idx] = Symbol::new(); + self.hash_tab[idx].icl = FSST_ICL_FREE; + } + } + self.n_symbols = 0; + } + + fn hash_insert(&mut self, s: Symbol) -> bool { + let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); + let taken = self.hash_tab[idx].icl < FSST_ICL_FREE; + if taken { + return false; + } + self.hash_tab[idx] = s; + true + } + + fn add(&mut self, mut s: Symbol) -> bool { + assert!(FSST_CODE_BASE + self.n_symbols < FSST_CODE_MAX); + let len = s.length(); + s.set_code_len(FSST_CODE_BASE + self.n_symbols, len); + if len == 1 { + self.byte_codes[s.first() as usize] = FSST_CODE_BASE + self.n_symbols + (1 << FSST_LEN_BITS); + } else if len == 2 { + self.short_codes[s.first2() as usize] = FSST_CODE_BASE + self.n_symbols + (2 << FSST_LEN_BITS); + } else if !self.hash_insert(s) { + return false; + } + self.symbols[FSST_CODE_BASE as usize + self.n_symbols as usize] = s; + self.len_histo[len as usize - 1] += 1; + self.n_symbols += 1; + true + } + + fn find_longest_symbol(&self, s: Symbol) -> u16 { + let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); + if self.hash_tab[idx].icl <= s.icl && self.hash_tab[idx].val == s.val { + return (self.hash_tab[idx].icl >> 16) & FSST_CODE_MASK as u64; + } + if s.length() >= 2 { + let code = self.short_codes[s.first2() as usize] & FSST_CODE_MASK; + if code >= FSST_CODE_BASE { + return code; + } + } + self.byte_codes[s.first() as usize] & FSST_CODE_MASK + } + + fn find_longest_symbol_slice(&self, cur: &[u8], end: &[u8]) -> u16 { + self.find_longest_symbol(Symbol::from_slice(&cur[..min(cur.len(), end.len())])) + } +} + +struct Counters { + count1: Vec, + count2: Vec>, +} + +impl Counters { + fn new() -> Self { + Counters { + count1: vec![0; FSST_CODE_MAX as usize], + count2: vec![vec![0; FSST_CODE_MAX as usize]; FSST_CODE_MAX as usize], + } + } + + fn count1_set(&mut self, pos1: usize, val: u16) { + self.count1[pos1] = val; + } + + fn count1_inc(&mut self, pos1: usize) { + self.count1[pos1] += 1; + } + + fn count2_inc(&mut self, pos1: usize, pos2: usize) { + self.count2[pos1][pos2] += 1; + } + + fn count1_get_next(&self, pos1: &mut usize) -> u32 { + self.count1[*pos1] as u32 + } + + fn count2_get_next(&self, pos1: usize, pos2: &mut usize) -> u32 { + self.count2[pos1][*pos2] as u32 + } + + fn backup1(&self, buf: &mut [u8]) { + unsafe { + std::ptr::copy_nonoverlapping( + self.count1.as_ptr() as *const u8, + buf.as_mut_ptr(), + FSST_CODE_MAX as usize * std::mem::size_of::(), + ); + } + } + + fn restore1(&mut self, buf: &[u8]) { + unsafe { + std::ptr::copy_nonoverlapping( + buf.as_ptr(), + self.count1.as_mut_ptr() as *mut u8, + FSST_CODE_MAX as usize * std::mem::size_of::(), + ); + } + } +} + +struct Encoder { + symbol_table: SymbolTable, + counters: Counters, +} + +impl Encoder { + fn new() -> Self { + Encoder { + symbol_table: SymbolTable::new(), + counters: Counters::new(), + } + } + + pub fn compress(&self, input: &[u8], output: &mut [u8]) -> (usize, usize) { + let mut in_pos = 0; + let mut out_pos = 0; + + while in_pos < input.len() && out_pos < output.len() { + let symbol = self.symbol_table.find_longest_symbol_slice(&input[in_pos..], &input[input.len()..]); + let code = symbol & FSST_CODE_MASK; + let len = (symbol >> FSST_LEN_BITS) as usize; + + if code < FSST_CODE_BASE { + // Escape byte + if out_pos + 2 > output.len() { + break; + } + output[out_pos] = FSST_ESC; + output[out_pos + 1] = input[in_pos]; + out_pos += 2; + in_pos += 1; + } else { + if out_pos + 1 > output.len() { + break; + } + output[out_pos] = code as u8; + out_pos += 1; + in_pos += len; + } + } + + (in_pos, out_pos) + } +} + +impl SymbolTable { + pub fn decompress(&self, input: &[u8], output: &mut [u8]) -> usize { + let mut in_pos = 0; + let mut out_pos = 0; + + while in_pos < input.len() && out_pos < output.len() { + let code = input[in_pos] as u16; + in_pos += 1; + + if code == FSST_ESC as u16 { + if in_pos >= input.len() { + break; + } + output[out_pos] = input[in_pos]; + in_pos += 1; + out_pos += 1; + } else { + let symbol = &self.symbols[code as usize]; + let len = symbol.length() as usize; + if out_pos + len > output.len() { + break; + } + output[out_pos..out_pos + len].copy_from_slice(&symbol.val[..len]); + out_pos += len; + } + } + + out_pos + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..44a4684 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,12 @@ +mod fsst; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +}