Skip to content

Commit

Permalink
bugfix, comment fix, force compile fails for big-endian (#5)
Browse files Browse the repository at this point in the history
Also add tests with Chinese characters
  • Loading branch information
a10y authored Aug 15, 2024
1 parent 56e0ace commit f2279a0
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 92 deletions.
36 changes: 0 additions & 36 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ use_debug = { level = "deny" }

[dev-dependencies]
criterion = "0.5"
lz4 = "1"

[[example]]
name = "round_trip"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ but it is mostly written from a careful reading of the paper.

**NOTE: The current implementation is still in progress and is not production-ready; use it at your own risk.**

**NOTE: This crate currently works only on little-endian architectures. There are no plans to support big-endian targets.**

[whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
[MIT-licensed implementation]: https://github.com/cwida/fsst
40 changes: 1 addition & 39 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@
//! Also contains LZ4 baseline.
#![allow(missing_docs)]
use core::str;
use std::io::{Cursor, Read, Write};

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use lz4::liblz4::BlockChecksum;
use lz4::{BlockSize, ContentChecksum};

use fsst_rs::{train, ESCAPE_CODE};

Expand Down Expand Up @@ -48,40 +45,5 @@ fn bench_fsst(c: &mut Criterion) {
});
}

/// LZ4 baseline benchmarks, registered under the Criterion group `"lz4"`.
///
/// Two benchmarks are defined: `"compress-single"` writes `TEST.as_bytes()` through an
/// LZ4 encoder, and `"decompress-single"` reads a pre-compressed copy of the same bytes
/// back through an LZ4 decoder. Both encoders are configured with a 64 KB maximum block
/// size and with content and block checksums disabled.
fn bench_lz4(c: &mut Criterion) {
let mut group = c.benchmark_group("lz4");

group.bench_function("compress-single", |b| {
// The output buffer and the encoder are created once, outside the timed closure.
let mut compressed = Vec::with_capacity(100_000_000);
let mut encoder = lz4::EncoderBuilder::new()
.block_size(BlockSize::Max64KB)
.checksum(ContentChecksum::NoChecksum)
.block_checksum(BlockChecksum::NoBlockChecksum)
.build(&mut compressed)
.unwrap();

// Timed section: every iteration writes the input into the *same* encoder, so the
// compressed output keeps accumulating across iterations.
// NOTE(review): the encoder is never finished here, so this presumably measures
// streaming writes only — confirm that is the intended baseline.
b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap());
});

group.bench_function("decompress-single", |b| {
// Setup (untimed): compress the input once into an owned buffer.
let compressed = Vec::new();
let mut encoder = lz4::EncoderBuilder::new()
.block_size(BlockSize::Max64KB)
.checksum(ContentChecksum::NoChecksum)
.block_checksum(BlockChecksum::NoBlockChecksum)
.build(compressed)
.unwrap();
encoder.write_all(TEST.as_bytes()).unwrap();
// `finish` hands back the underlying buffer together with the final result.
let (compressed, result) = encoder.finish();
result.unwrap();

// The decoder and the output vector are also created once, outside the timed closure.
let cursor = Cursor::new(compressed);
let mut decoder = lz4::Decoder::new(cursor).unwrap();
let mut output = Vec::new();

// Timed section: `read_to_end` is called repeatedly on the same decoder.
// NOTE(review): after the first iteration the decoder is presumably at end of
// stream, so later iterations likely read nothing — verify before trusting the numbers.
b.iter(|| decoder.read_to_end(&mut output).unwrap());
});
}

criterion_group!(compress_bench, bench_fsst, bench_lz4);
criterion_group!(compress_bench, bench_fsst);
criterion_main!(compress_bench);
6 changes: 3 additions & 3 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,19 +103,19 @@ impl SymbolTable {
fn optimize(&self, counters: Counter) -> Self {
let mut res = SymbolTable::default();
let mut pqueue = BinaryHeap::new();
for code1 in 0..511 {
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
let gain = counters.count1(code1) * symbol1.len();
pqueue.push(Candidate {
symbol: symbol1,
gain,
});

for code2 in 0..511 {
for code2 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol2 = &self.symbols[code2 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
// length greater than 8, skip.
if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() {
if symbol1.len() + symbol2.len() >= 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
Expand Down
2 changes: 1 addition & 1 deletion src/find_longest/naive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ impl FindLongestSymbol for SymbolTable {
// Start with the code corresponding to the escape of the first character in the text
let mut best_code = text[0] as u16;
let mut best_overlap = 1;
for code in 256..511 {
for code in 256..(256 + self.n_symbols as u16) {
let symbol = &self.symbols[code as usize];
if symbol.is_prefix(text) && symbol.len() > best_overlap {
best_code = code;
Expand Down
26 changes: 14 additions & 12 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![doc = include_str!("../README.md")]
#![cfg(target_endian = "little")]

/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes.
macro_rules! assert_sizeof {
Expand Down Expand Up @@ -49,10 +50,12 @@ impl Symbol {
}

impl Symbol {
/// Calculate the length of the symbol in bytes.
/// Calculate the length of the symbol in bytes. Always a value between 1 and 8.
///
/// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols
/// can contain fewer bytes, padded with 0x00.
/// can contain fewer bytes, padded with 0x00. There is a special case of a symbol
/// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000`
/// but we want to interpret that as a one-byte symbol containing `0x00`.
pub fn len(&self) -> usize {
let numeric = unsafe { self.num };
// For little-endian platforms, this counts the number of *trailing* zeros
Expand Down Expand Up @@ -113,10 +116,10 @@ impl Symbol {

/// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
pub fn concat(&self, other: &Self) -> Self {
let new_len = self.len() + other.len();
let self_len = self.len();
let new_len = self_len + other.len();
assert!(new_len <= 8, "cannot build symbol with length > 8");

let self_len = self.len();
let mut result = *self;

// SAFETY: self_len and new_len are checked to be <= 8
Expand Down Expand Up @@ -290,16 +293,15 @@ impl SymbolTable {
true
}

/// Using the symbol table, runs a single cycle of compression from the front of `in_ptr`, writing
/// the output into `out_ptr`. Attempts to process an entire 64-bit word of prefix from `in_ptr`.
/// Using the symbol table, runs a single cycle of compression on an input word, writing
/// the output into `out_ptr`.
///
/// # Returns
///
/// This function returns a tuple of (code, advance_in, advance_out).
///
/// `code` is the code that was emitted into the output buffer.
/// This function returns a tuple of (advance_in, advance_out) with the number of bytes
/// for the caller to advance the input and output pointers.
///
/// `advance_in` is the number of bytes to advance `in_ptr` before the next call.
/// `advance_in` is the number of bytes to advance the input pointer before the next call.
///
/// `advance_out` is the number of bytes to advance `out_ptr` before the next call.
///
Expand Down Expand Up @@ -421,13 +423,13 @@ impl SymbolTable {

/// Decompress a byte slice that was previously returned by [compression][Self::compress].
pub fn decompress(&self, compressed: &[u8]) -> Vec<u8> {
let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * compressed.len());
let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * (compressed.len() + 1));
let ptr = decoded.as_mut_ptr();

let mut in_pos = 0;
let mut out_pos = 0;

while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::<Symbol>()) {
while in_pos < compressed.len() && out_pos < (decoded.capacity() - size_of::<Symbol>()) {
let code = compressed[in_pos];
if code == ESCAPE_CODE {
// Advance by one, do raw write.
Expand Down
25 changes: 25 additions & 0 deletions tests/correctness.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#![cfg(test)]

use fsst_rs::Symbol;

static PREAMBLE: &str = r#"
When in the Course of human events, it becomes necessary for one people to dissolve
the political bands which have connected them with another, and to assume among the
Expand All @@ -9,6 +11,8 @@ that they should declare the causes which impel them to the separation."#;

static DECLARATION: &str = include_str!("./fixtures/declaration.txt");

static ART_OF_WAR: &str = include_str!("./fixtures/art_of_war.txt");

#[test]
fn test_basic() {
// Roundtrip the declaration
Expand All @@ -29,6 +33,18 @@ fn test_train_on_empty() {
);
}

/// Round-trips a single byte through a table that holds exactly one one-byte symbol.
#[test]
fn test_one_byte() {
    // Start from a default table and register the symbol for byte 0x01; the
    // assertion below expects that symbol to be emitted as code 0.
    let mut table = fsst_rs::SymbolTable::default();
    table.insert(Symbol::from_u8(0x01));

    let encoded = table.compress(&[0x01]);
    assert_eq!(encoded, vec![0u8]);

    let decoded = table.decompress(&encoded);
    assert_eq!(decoded, vec![0x01]);
}

#[test]
fn test_zeros() {
println!("training zeros");
Expand Down Expand Up @@ -57,3 +73,12 @@ fn test_large() {
let compressed = trained.compress(massive.as_bytes());
assert_eq!(trained.decompress(&compressed), massive.as_bytes());
}

/// Trains on the `ART_OF_WAR` fixture and checks that compressing and then
/// decompressing it reproduces the original bytes.
#[test]
fn test_chinese() {
    let input = ART_OF_WAR.as_bytes();
    let table = fsst_rs::train(input);

    let round_tripped = table.decompress(&table.compress(input));
    assert_eq!(input, round_tripped);
}
Loading

0 comments on commit f2279a0

Please sign in to comment.