From 90794e5f2991e0dd0d3cab8a9d56d7c8b580c1dd Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sat, 10 Aug 2024 15:52:54 -0400 Subject: [PATCH 1/7] implementation v0 --- .gitignore | 1 + Cargo.lock | 624 ++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 9 +- benches/compress.rs | 103 ++++++++ benches/dracula.txt | 1 + src/builder.rs | 203 ++++++++++++++ src/fsst.rs | 325 ----------------------- src/lib.rs | 258 +++++++++++++++++- src/longest.rs | 25 ++ 9 files changed, 1215 insertions(+), 334 deletions(-) create mode 100644 benches/compress.rs create mode 100644 benches/dracula.txt create mode 100644 src/builder.rs delete mode 100644 src/fsst.rs create mode 100644 src/longest.rs diff --git a/.gitignore b/.gitignore index a5ff07f..8b196e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target +.idea/ # Added by cargo diff --git a/Cargo.lock b/Cargo.lock index 414f6c6..ef3912f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,630 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", 
+] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "fsst-rs" version = "0.1.0" +dependencies = [ + "criterion", + "lz4", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lz4" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "plotters" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" + +[[package]] +name = "plotters-svg" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.124" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + 
"wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index c90cd98..301d560 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,4 +3,11 @@ name = "fsst-rs" version = "0.1.0" edition = "2021" -[dependencies] +[dev-dependencies] +criterion = "0.5" +lz4 = "1" + +[[bench]] +name = "compress" +harness = false +bench = true diff --git a/benches/compress.rs b/benches/compress.rs new file mode 100644 index 0000000..10cc7ce --- /dev/null +++ b/benches/compress.rs @@ -0,0 +1,103 @@ +use std::io::{Cursor, Read, Write}; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use lz4::liblz4::BlockChecksum; +use 
lz4::{BlockSize, ContentChecksum}; + +use fsst_rs::{train, SymbolTable}; + +const CORPUS: &str = include_str!("dracula.txt"); +const TEST: &str = "I found my smattering of German very useful here"; + +fn bench_fsst(c: &mut Criterion) { + let mut group = c.benchmark_group("fsst"); + group.bench_function("train", |b| { + let corpus = CORPUS.as_bytes(); + b.iter(|| black_box(train(black_box(corpus)))); + }); + + let table = train(CORPUS); + let plaintext = TEST.as_bytes(); + + let compressed = table.compress(plaintext); + let escape_count = compressed + .iter() + .filter(|b| **b == SymbolTable::ESCAPE) + .count(); + let ratio = (plaintext.len() as f64) / (compressed.len() as f64); + println!( + "Escapes = {escape_count}/{}, compression_ratio = {ratio}", + compressed.len() + ); + + assert_eq!(table.decompress(&compressed), TEST.as_bytes()); + + group.bench_function("compress-single", |b| { + b.iter(|| black_box(table.compress(black_box(plaintext)))); + }); + + group.bench_function("decompress-single", |b| { + b.iter(|| black_box(table.decompress(black_box(&compressed)))); + }); +} + +fn bench_lz4(c: &mut Criterion) { + let mut group = c.benchmark_group("lz4"); + + // { + // let compressed = Vec::with_capacity(10_000); + // let mut encoder = lz4::EncoderBuilder::new() + // .block_size(BlockSize::Max64KB) + // .build(compressed) + // .unwrap(); + // + // encoder.write_all(TEST.as_bytes()).unwrap(); + // let (compressed, result) = encoder.finish(); + // result.unwrap(); + // + // let ratio = (TEST.as_bytes().len() as f64) / (compressed.len() as f64); + // println!("LZ4 compress_ratio = {ratio}"); + // + // // ensure decodes cleanly + // let cursor = Cursor::new(compressed); + // let mut decoder = lz4::Decoder::new(cursor).unwrap(); + // let mut output = String::new(); + // + // decoder.read_to_string(&mut output).unwrap(); + // assert_eq!(output.as_str(), TEST); + // } + + group.bench_function("compress-single", |b| { + let mut compressed = 
Vec::with_capacity(100_000_000); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(&mut compressed) + .unwrap(); + + b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap()); + }); + + group.bench_function("decompress-single", |b| { + let compressed = Vec::new(); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(compressed) + .unwrap(); + encoder.write_all(TEST.as_bytes()).unwrap(); + let (compressed, result) = encoder.finish(); + result.unwrap(); + + let cursor = Cursor::new(compressed); + let mut decoder = lz4::Decoder::new(cursor).unwrap(); + let mut output = Vec::new(); + + b.iter(|| decoder.read_to_end(&mut output).unwrap()); + }); +} + +criterion_group!(compress_bench, bench_fsst, bench_lz4); +criterion_main!(compress_bench); diff --git a/benches/dracula.txt b/benches/dracula.txt new file mode 100644 index 0000000..88adb22 --- /dev/null +++ b/benches/dracula.txt @@ -0,0 +1 @@ +How these papers have been placed in sequence will be made manifest in the reading of them. All needless matters have been eliminated, so that a history almost at variance with the possibilities of later-day belief may stand forth as simple fact. There is throughout no statement of past things wherein memory may err, for all the records chosen are exactly contemporary, given from the standpoints and within the range of knowledge of those who made them. We left in pretty good time, and came after nightfall to Klausenburgh. Here I stopped for the night at the Hotel Royale. I had for dinner, or rather supper, a chicken done up some way with red pepper, which was very good but thirsty. (Mem., get recipe for Mina.) 
I asked the waiter, and he said it was called “paprika hendl,” and that, as it was a national dish, I should be able to get it anywhere along the Carpathians. I found my smattering of German very useful here; indeed, I don’t know how I should be able to get on without it. diff --git a/src/builder.rs b/src/builder.rs new file mode 100644 index 0000000..c7ae814 --- /dev/null +++ b/src/builder.rs @@ -0,0 +1,203 @@ +//! Functions and types used for building a [`SymbolTable`] from a corpus of text. +//! +//! This module implements the logic from Algorithm 3 of the [FSST Paper]. +//! +//! [FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::{Code, Symbol, SymbolTable}; + +#[derive(Debug, Clone)] +struct Counter { + /// Frequency count for each code. + counts1: Vec, + + /// Frequency count for each code-pair. + counts2: Vec>, +} + +impl Counter { + fn new() -> Self { + Self { + counts1: vec![0; 512], + counts2: vec![vec![0; 512]; 512], + } + } + + #[inline] + fn record_count1(&mut self, code1: Code) { + self.counts1[code1.0 as usize] += 1; + } + + #[inline] + fn record_count2(&mut self, code1: Code, code2: Code) { + self.counts2[code1.0 as usize][code2.0 as usize] += 1; + } + + #[inline] + fn count1(&self, code: Code) -> usize { + self.counts1[code.0 as usize] + } + + #[inline] + fn count2(&self, code1: Code, code2: Code) -> usize { + self.counts2[code1.0 as usize][code2.0 as usize] + } +} + +pub const MAX_GENERATIONS: usize = 5; + +pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { + let mut table = SymbolTable::default(); + // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. + let sample = corpus.as_ref(); + for _generation in 0..MAX_GENERATIONS { + let counter = table.compress_count(sample); + table = table.optimize(counter); + } + + table +} + +impl SymbolTable { + /// Compress the text using the current symbol table. 
Count the code occurrences + /// and code-pair occurrences to allow us to calculate apparent gain. + fn compress_count(&self, sample: &[u8]) -> Counter { + let mut counter = Counter::new(); + let len = sample.len(); + let mut prev_code = self.find_longest_symbol(sample); + counter.record_count1(prev_code); + let mut pos = self.symbols[prev_code.0 as usize].len(); + + while pos < len { + let code = self.find_longest_symbol(&sample[pos..len]); + counter.record_count1(code); + counter.record_count2(prev_code, code); + pos += self.symbols[code.0 as usize].len(); + prev_code = code; + } + + counter + } + + /// Using a set of counters and the existing set of symbols, build a new + /// set of symbols/codes that optimizes the gain over the distribution in `counters`. + fn optimize(&self, counters: Counter) -> Self { + let mut res = SymbolTable::default(); + let mut pqueue = BinaryHeap::new(); + for code1 in 0..512 { + let code1 = Code::from_u16(code1); + let symbol1 = self.symbols[code1.0 as usize]; + let gain = counters.count1(code1) * symbol1.len(); + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + + for code2 in 0..512 { + let code2 = Code::from_u16(code2); + let symbol2 = &self.symbols[code2.0 as usize]; + // If either symbol is zero-length, or if merging would yield a symbol of + // length greater than 8, skip. NOTE(review): the check below uses `>= 8`, so merges of exactly 8 bytes are skipped too; confirm whether `> 8` was intended. + if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { + continue; + } + let new_symbol = symbol1.concat(symbol2); + // Assert the symbol is not empty + assert!( + !new_symbol.is_empty(), + "symbol made by merging {:?} and {:?} is empty", + symbol1, + symbol2, + ); + let gain = counters.count2(code1, code2); + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } + } + + // Pop the 255 best symbols. NOTE(review): the code below uses `iter().take(255)`, and `BinaryHeap::iter` visits elements in arbitrary order rather than by descending gain; confirm whether repeated `pop()` was intended.
+ pqueue + .iter() + .take(255) + .for_each(|candidate| res.insert(candidate.symbol)); + + res + } +} + +struct Candidate { + gain: usize, + symbol: Symbol, +} + +impl Candidate { + fn comparable_form(&self) -> (usize, usize) { + (self.gain, self.symbol.len()) + } +} + +impl Eq for Candidate {} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.comparable_form().eq(&other.comparable_form()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + let self_ord = (self.gain, self.symbol.len()); + let other_ord = (other.gain, other.symbol.len()); + + self_ord.cmp(&other_ord) + } +} + +#[cfg(test)] +mod test { + use crate::{train, SymbolTable}; + + #[test] + fn test_builder() { + // Train a SymbolTable on the toy string + let text = "hello world"; + let table = train(text.as_bytes()); + + // Use the table to compress a string, see the values + let compressed = table.compress(text.as_bytes()); + + // Ensure that the compressed string has no escape bytes + assert!(compressed.iter().all(|b| *b != SymbolTable::ESCAPE)); + + // Ensure that we can compress a string with no values seen at training time. 
+ let compressed = table.compress("xyz123".as_bytes()); + assert_eq!( + compressed, + vec![ + SymbolTable::ESCAPE, + b'x', + SymbolTable::ESCAPE, + b'y', + SymbolTable::ESCAPE, + b'z', + SymbolTable::ESCAPE, + b'1', + SymbolTable::ESCAPE, + b'2', + SymbolTable::ESCAPE, + b'3', + ] + ) + } +} diff --git a/src/fsst.rs b/src/fsst.rs deleted file mode 100644 index 960db9d..0000000 --- a/src/fsst.rs +++ /dev/null @@ -1,325 +0,0 @@ -use std::cmp::min; - -const FSST_CODE_MAX: u16 = 256; -const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; -const FSST_LEN_BITS: u32 = 12; -const FSST_CODE_BITS: u32 = 9; -const FSST_CODE_BASE: u16 = 256; -const FSST_HASH_LOG2SIZE: usize = 10; -const FSST_HASH_PRIME: u64 = 2971215073; -const FSST_SHIFT: u32 = 15; -const FSST_ICL_FREE: u64 = (15 << 28) | ((FSST_CODE_MASK as u64) << 16); -const FSST_MAXHEADER: usize = 8 + 1 + 8 + 2048 + 1; -const FSST_ESC: u8 = 255; - -#[inline(always)] -fn fsst_unaligned_load(v: &[u8]) -> u64 { - let mut ret: u64 = 0; - unsafe { - std::ptr::copy_nonoverlapping(v.as_ptr(), &mut ret as *mut u64 as *mut u8, 8); - } - ret -} - -#[inline(always)] -fn fsst_hash(w: u64) -> u64 { - ((w * FSST_HASH_PRIME) ^ ((w * FSST_HASH_PRIME) >> FSST_SHIFT)) -} - -#[derive(Clone, Copy)] -struct Symbol { - val: [u8; 8], - icl: u64, -} - -impl Symbol { - const MAX_LENGTH: usize = 8; - - fn new() -> Self { - Symbol { val: [0; 8], icl: 0 } - } - - fn from_byte(c: u8, code: u16) -> Self { - let mut s = Symbol::new(); - s.val[0] = c; - s.set_code_len(code, 1); - s - } - - fn from_slice(input: &[u8]) -> Self { - let mut s = Symbol::new(); - let len = min(input.len(), Self::MAX_LENGTH); - s.val[..len].copy_from_slice(&input[..len]); - s.set_code_len(FSST_CODE_MASK, len as u32); - s - } - - fn set_code_len(&mut self, code: u16, len: u32) { - self.icl = (len << 28) as u64 | (code as u64) << 16 | ((8 - len) * 8) as u64; - } - - fn length(&self) -> u32 { - (self.icl >> 28) as u32 - } - - fn code(&self) -> u16 { - ((self.icl >> 16) & FSST_CODE_MASK 
as u64) as u16 - } - - fn ignored_bits(&self) -> u32 { - self.icl as u32 - } - - fn first(&self) -> u8 { - self.val[0] - } - - fn first2(&self) -> u16 { - u16::from_le_bytes([self.val[0], self.val[1]]) - } - - fn hash(&self) -> usize { - let v = u32::from_le_bytes([self.val[0], self.val[1], self.val[2], self.val[3]]); - fsst_hash(v as u64) as usize - } -} - -struct SymbolTable { - short_codes: [u16; 65536], - byte_codes: [u16; 256], - symbols: Vec, - hash_tab: Vec, - n_symbols: u16, - suffix_lim: u16, - terminator: u16, - zero_terminated: bool, - len_histo: [u16; FSST_CODE_BITS as usize], -} - -impl SymbolTable { - fn new() -> Self { - let mut st = SymbolTable { - short_codes: [0; 65536], - byte_codes: [0; 256], - symbols: vec![Symbol::new(); FSST_CODE_MAX as usize], - hash_tab: vec![Symbol::new(); 1 << FSST_HASH_LOG2SIZE], - n_symbols: 0, - suffix_lim: FSST_CODE_MAX, - terminator: 0, - zero_terminated: false, - len_histo: [0; FSST_CODE_BITS as usize], - }; - - for i in 0..256 { - st.symbols[i] = Symbol::from_byte(i as u8, i as u16 | (1 << FSST_LEN_BITS)); - } - - for i in 256..FSST_CODE_MAX as usize { - st.symbols[i] = Symbol::from_byte(0, FSST_CODE_MASK); - } - - for i in 0..256 { - st.byte_codes[i] = (1 << FSST_LEN_BITS) | i as u16; - } - - for i in 0..65536 { - st.short_codes[i] = (1 << FSST_LEN_BITS) | (i & 255) as u16; - } - - st - } - - fn clear(&mut self) { - self.len_histo = [0; FSST_CODE_BITS as usize]; - for i in FSST_CODE_BASE as usize..FSST_CODE_BASE as usize + self.n_symbols as usize { - let symbol = &self.symbols[i]; - if symbol.length() == 1 { - let val = symbol.first(); - self.byte_codes[val as usize] = (1 << FSST_LEN_BITS) | val as u16; - } else if symbol.length() == 2 { - let val = symbol.first2(); - self.short_codes[val as usize] = (1 << FSST_LEN_BITS) | (val & 255); - } else { - let idx = symbol.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - self.hash_tab[idx] = Symbol::new(); - self.hash_tab[idx].icl = FSST_ICL_FREE; - } - } - self.n_symbols = 0; 
- } - - fn hash_insert(&mut self, s: Symbol) -> bool { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - let taken = self.hash_tab[idx].icl < FSST_ICL_FREE; - if taken { - return false; - } - self.hash_tab[idx] = s; - true - } - - fn add(&mut self, mut s: Symbol) -> bool { - assert!(FSST_CODE_BASE + self.n_symbols < FSST_CODE_MAX); - let len = s.length(); - s.set_code_len(FSST_CODE_BASE + self.n_symbols, len); - if len == 1 { - self.byte_codes[s.first() as usize] = FSST_CODE_BASE + self.n_symbols + (1 << FSST_LEN_BITS); - } else if len == 2 { - self.short_codes[s.first2() as usize] = FSST_CODE_BASE + self.n_symbols + (2 << FSST_LEN_BITS); - } else if !self.hash_insert(s) { - return false; - } - self.symbols[FSST_CODE_BASE as usize + self.n_symbols as usize] = s; - self.len_histo[len as usize - 1] += 1; - self.n_symbols += 1; - true - } - - fn find_longest_symbol(&self, s: Symbol) -> u16 { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - if self.hash_tab[idx].icl <= s.icl && self.hash_tab[idx].val == s.val { - return (self.hash_tab[idx].icl >> 16) & FSST_CODE_MASK as u64; - } - if s.length() >= 2 { - let code = self.short_codes[s.first2() as usize] & FSST_CODE_MASK; - if code >= FSST_CODE_BASE { - return code; - } - } - self.byte_codes[s.first() as usize] & FSST_CODE_MASK - } - - fn find_longest_symbol_slice(&self, cur: &[u8], end: &[u8]) -> u16 { - self.find_longest_symbol(Symbol::from_slice(&cur[..min(cur.len(), end.len())])) - } -} - -struct Counters { - count1: Vec, - count2: Vec>, -} - -impl Counters { - fn new() -> Self { - Counters { - count1: vec![0; FSST_CODE_MAX as usize], - count2: vec![vec![0; FSST_CODE_MAX as usize]; FSST_CODE_MAX as usize], - } - } - - fn count1_set(&mut self, pos1: usize, val: u16) { - self.count1[pos1] = val; - } - - fn count1_inc(&mut self, pos1: usize) { - self.count1[pos1] += 1; - } - - fn count2_inc(&mut self, pos1: usize, pos2: usize) { - self.count2[pos1][pos2] += 1; - } - - fn count1_get_next(&self, pos1: &mut 
usize) -> u32 { - self.count1[*pos1] as u32 - } - - fn count2_get_next(&self, pos1: usize, pos2: &mut usize) -> u32 { - self.count2[pos1][*pos2] as u32 - } - - fn backup1(&self, buf: &mut [u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - self.count1.as_ptr() as *const u8, - buf.as_mut_ptr(), - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } - - fn restore1(&mut self, buf: &[u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - buf.as_ptr(), - self.count1.as_mut_ptr() as *mut u8, - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } -} - -struct Encoder { - symbol_table: SymbolTable, - counters: Counters, -} - -impl Encoder { - fn new() -> Self { - Encoder { - symbol_table: SymbolTable::new(), - counters: Counters::new(), - } - } - - pub fn compress(&self, input: &[u8], output: &mut [u8]) -> (usize, usize) { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let symbol = self.symbol_table.find_longest_symbol_slice(&input[in_pos..], &input[input.len()..]); - let code = symbol & FSST_CODE_MASK; - let len = (symbol >> FSST_LEN_BITS) as usize; - - if code < FSST_CODE_BASE { - // Escape byte - if out_pos + 2 > output.len() { - break; - } - output[out_pos] = FSST_ESC; - output[out_pos + 1] = input[in_pos]; - out_pos += 2; - in_pos += 1; - } else { - if out_pos + 1 > output.len() { - break; - } - output[out_pos] = code as u8; - out_pos += 1; - in_pos += len; - } - } - - (in_pos, out_pos) - } -} - -impl SymbolTable { - pub fn decompress(&self, input: &[u8], output: &mut [u8]) -> usize { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let code = input[in_pos] as u16; - in_pos += 1; - - if code == FSST_ESC as u16 { - if in_pos >= input.len() { - break; - } - output[out_pos] = input[in_pos]; - in_pos += 1; - out_pos += 1; - } else { - let symbol = &self.symbols[code as usize]; - let len = symbol.length() as usize; - if out_pos + len > 
output.len() { - break; - } - output[out_pos..out_pos + len].copy_from_slice(&symbol.val[..len]); - out_pos += len; - } - } - - out_pos - } -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 44a4684..7fff00d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,254 @@ -mod fsst; +use std::fmt::{Debug, Formatter}; -#[cfg(test)] -mod tests { - use super::*; +pub use builder::*; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); +mod builder; +mod longest; + +pub const ESCAPE: u8 = 0xFF; + +/// A Symbol wraps a set of values of +#[derive(Copy, Clone)] +pub union Symbol { + bytes: [u8; 8], + num: u64, +} + +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", unsafe { self.num }) + } +} + +impl Symbol { + pub const ZERO: Self = Self::zero(); + + pub fn from_slice(slice: &[u8; 8]) -> Self { + Self { bytes: *slice } + } + + /// Return a zero symbol + const fn zero() -> Self { + Self { num: 0 } + } + + /// Create a new single-byte symbol + pub fn from_u8(value: u8) -> Self { + Self { + bytes: [value, 0, 0, 0, 0, 0, 0, 0], + } + } +} + +impl Symbol { + /// Calculate the length of the symbol in bytes. + /// + /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols + /// can contain fewer bytes, padded with 0x00. + pub fn len(&self) -> usize { + let numeric = unsafe { self.num }; + // For little-endian platforms, this counts the number of *trailing* zeros + let null_bytes = (numeric.leading_zeros() >> 3) as usize; + + size_of::() - null_bytes + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn as_slice(&self) -> &[u8] { + let len = self.len(); + // Safety: the length from `len()` can never be more than 8. 
+ unsafe { &self.bytes[0..len] } + } + + pub fn append_to(&self, vec: &mut Vec) { + match self.len() { + 0 => self.append_inner::<0>(vec), + 1 => self.append_inner::<1>(vec), + 2 => self.append_inner::<2>(vec), + 3 => self.append_inner::<3>(vec), + 4 => self.append_inner::<4>(vec), + 5 => self.append_inner::<5>(vec), + 6 => self.append_inner::<6>(vec), + 7 => self.append_inner::<7>(vec), + 8 => self.append_inner::<8>(vec), + _ => unreachable!("Symbol::len() always ≤ 8"), + } + } + + fn append_inner(&self, vec: &mut Vec) { + for i in 0..N { + let byte: u8 = unsafe { self.num >> i } as u8; + vec.push(byte); + } + } + + /// Returns true if the symbol is a prefix of the provided text. + pub fn is_prefix(&self, text: &[u8]) -> bool { + text.starts_with(self.as_slice()) + } + + pub fn concat(&self, other: &Self) -> Self { + let new_len = self.len() + other.len(); + assert!(new_len <= 8, "cannot build symbol with length > 8"); + + let self_len = self.len(); + let mut result = *self; + unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) }; + + result + } +} + +/// Codes correspond to bytes. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Code(u16); + +impl Code { + pub const CODE_MAX: u16 = 512; + + /// Create a new code representing an escape byte. + pub fn new_escaped(byte: u8) -> Self { + Self(byte as u16) + } + + /// Create a new code representing a symbol. + pub fn new_symbol(code: u8) -> Self { + Self((code as u16) + 256) + } + + /// Create a `Code` directly from a `u16` value. + /// + /// # Panics + /// Panic if the value is ≥ the defined `CODE_MAX`. + pub fn from_u16(code: u16) -> Self { + assert!(code < Self::CODE_MAX, "code value higher than CODE_MAX"); + + Self(code) + } + + /// Returns true if the code is for an escape byte. + #[inline] + pub fn is_escape(&self) -> bool { + self.0 <= 255 + } +} + +#[derive(Clone, Debug)] +pub struct SymbolTable { + /// Table mapping codes to symbols. 
+ pub(crate) symbols: [Symbol; 512], + + /// Indicates the number of entries in the symbol table that have been populated. + /// + /// This value is always at least 256, as the first 256 entries in the `table` are the escape + /// bytes. + pub(crate) n_symbols: usize, +} + +impl Default for SymbolTable { + fn default() -> Self { + let mut table = Self { + symbols: [Symbol::ZERO; 512], + n_symbols: 0, + }; + + // Populate the escape byte entries. + for byte in 0..=255 { + table.symbols[byte as usize] = Symbol::from_u8(byte); + } + table.n_symbols = 256; + + table + } +} + +/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s. +/// +/// The symbol table is trained on a corpus of data in the form of a single byte array, building up +/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". +impl SymbolTable { + pub const ESCAPE: u8 = 255; + + /// Insert a new symbol at the end of the table. + /// + /// # Panics + /// Panics if the table is already full. + pub fn insert(&mut self, symbol: Symbol) { + assert!(self.n_symbols < 512, "cannot insert into full symbol table"); + self.symbols[self.n_symbols] = symbol; + self.n_symbols += 1; + } + + /// Return a new encoded sequence of data bytes instead. + pub fn compress(&self, plaintext: &[u8]) -> Vec { + let mut values = Vec::with_capacity(2 * plaintext.len()); + let len = plaintext.len(); + let mut pos = 0; + while pos < len { + // println!("COMPRESS pos={pos} len={len} in_progress_size={}", values.len()); + let next_code = self.find_longest_symbol(&plaintext[pos..len]); + if next_code.is_escape() { + // Case 1 -escape: push an ESCAPE followed by the next byte. 
+ // println!("ESCAPE"); + values.push(Self::ESCAPE); + values.push(next_code.0 as u8); + pos += 1; + } else { + // Case 2 - code: push the code, increment position by symbol length + let symbol = self.symbols[next_code.0 as usize]; + // println!("APPEND symbol={:?} len={}", symbol.as_slice(), symbol.len()); + values.push(next_code.0 as u8); + pos += symbol.len(); + } + } + + values + } + + /// Decompress the provided byte slice into a [`String`] using the symbol table. + pub fn decompress(&self, compressed: &[u8]) -> Vec { + let mut decoded: Vec = Vec::with_capacity(size_of::() * compressed.len()); + let ptr = decoded.as_mut_ptr(); + + let mut in_pos = 0; + let mut out_pos = 0; + + while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { + let code = compressed[in_pos]; + if code == SymbolTable::ESCAPE { + // Advance by one, do raw write. + in_pos += 1; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize); + write_addr.write(compressed[in_pos]); + } + out_pos += 1; + in_pos += 1; + } else { + let symbol = self.symbols[256 + code as usize]; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize) as *mut u64; + // Perform 8 byte unaligned write. + write_addr.write_unaligned(symbol.num); + } + in_pos += 1; + out_pos += symbol.len(); + } + } + + assert!( + in_pos >= compressed.len(), + "decompression should exhaust input before output" + ); + + // SAFETY: we enforce in the loop condition that out_pos <= decoded.capacity() + unsafe { decoded.set_len(out_pos) }; + + decoded } } diff --git a/src/longest.rs b/src/longest.rs new file mode 100644 index 0000000..50f0ff7 --- /dev/null +++ b/src/longest.rs @@ -0,0 +1,25 @@ +use crate::{Code, SymbolTable}; + +/// Find the longest substring. 
+ +impl SymbolTable { + // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. + #[inline(never)] + pub(crate) fn find_longest_symbol(&self, text: &[u8]) -> Code { + debug_assert!(!text.is_empty(), "text must not be empty"); + + // Find the code that best maps to the provided text table here. + let mut best_code = Code::new_escaped(text[0]); + let mut best_overlap = 1; + for code in 0..512 { + let symbol = &self.symbols[code as usize]; + if symbol.is_prefix(text) && symbol.len() > best_overlap { + // println!("using ideal code: code={code} symbol{:?} len={}", symbol.as_slice(), symbol.len()); + best_code = Code::from_u16(code); + best_overlap = symbol.len(); + } + } + + best_code + } +} From 092d4e99444edfdb94c466c683eb456c593decb7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 16:51:17 -0400 Subject: [PATCH 2/7] add actions files --- .github/workflows/ci.yml | 56 +++++++++++++++++++++++++++++++ .github/workflows/release-plz.yml | 27 +++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release-plz.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..7d5a729 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,56 @@ +name: CI + +on: + push: + branches: [ "develop" ] + pull_request: { } + workflow_dispatch: { } + +permissions: + actions: read + contents: read + +jobs: + build: + name: 'build' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Rust Version + id: rust-version + shell: bash + run: echo "version=$(cat rust-toolchain.toml | grep channel | awk -F'\"' '{print $2}')" >> $GITHUB_OUTPUT + + - name: Rust Toolchain + id: rust-toolchain + uses: dtolnay/rust-toolchain@master + if: steps.rustup-cache.outputs.cache-hit != 'true' + with: + toolchain: "${{ steps.rust-version.outputs.version }}" + components: clippy, rustfmt + + - name: Rust Dependency Cache + 
uses: Swatinem/rust-cache@v2 + with: + save-if: ${{ github.ref == 'refs/heads/develop' }} + shared-key: "shared" # To allow reuse across jobs + + - name: Rust Compile Cache + uses: mozilla-actions/sccache-action@v0.0.5 + - name: Rust Compile Cache Config + shell: bash + # echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV + run: | + echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV + echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV + echo "CARGO_INCREMENTAL=0" >> $GITHUB_ENV + + - name: Rust Build + run: cargo build --all-features --all-targets + - name: Rust Lint - Format + run: cargo fmt --all --check + - name: Rust Lint - Clippy + run: cargo clippy --all-features --all-targets + - name: Rust Test + run: cargo test --workspace --all-features \ No newline at end of file diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml new file mode 100644 index 0000000..5d30bcf --- /dev/null +++ b/.github/workflows/release-plz.yml @@ -0,0 +1,27 @@ +name: Release-plz + +permissions: + pull-requests: write + contents: write + +on: + push: + branches: + - develop + +jobs: + release-plz: + name: Release-plz + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + - name: Run release-plz + uses: MarcoIeni/release-plz-action@v0.5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} From 4618d92425265636843fe53ab205ea3017561455 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 16:54:16 -0400 Subject: [PATCH 3/7] add toolchain --- rust-toolchain.toml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 rust-toolchain.toml diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..544af13 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,5 @@ +[toolchain] +channel = "nightly-2024-06-19" +components 
= ["rust-src", "rustfmt", "clippy"] +profile = "minimal" + From 18523764245ac4a96aab8a5f8f8fa57d0225e2b1 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 17:51:07 -0400 Subject: [PATCH 4/7] deny(missing_docs), 512 -> 511 --- Cargo.lock | 2 +- Cargo.toml | 17 +++++++- benches/compress.rs | 10 ++++- src/builder.rs | 39 +++++++++++------ src/lib.rs | 101 +++++++++++++++++++++++++++++--------------- src/longest.rs | 3 +- 6 files changed, 121 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef3912f..48d9198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,7 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" dependencies = [ "criterion", "lz4", diff --git a/Cargo.toml b/Cargo.toml index 301d560..030301c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,23 @@ [package] name = "fsst-rs" -version = "0.1.0" +version = "0.0.1" edition = "2021" +[lints.rust] +warnings = "deny" +missing_docs = "deny" + +[lints.clippy] +all = { level = "deny", priority = -1 } +if_then_some_else_none = { level = "deny" } +mem_forget = { level = "deny" } +or_fun_call = "deny" +panic_in_result_fn = { level = "deny" } +same_name_method = { level = "deny" } +tests_outside_test_module = { level = "deny" } +unwrap_in_result = { level = "deny" } +use_debug = { level = "deny" } + [dev-dependencies] criterion = "0.5" lz4 = "1" diff --git a/benches/compress.rs b/benches/compress.rs index 10cc7ce..829b7e6 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,10 +1,16 @@ +//! Compression benchmark. +//! +//! Contains benchmarks for FSST compression, decompression, and symbol table training. +//! +//! Also contains LZ4 baseline. 
+#![allow(missing_docs)] use std::io::{Cursor, Read, Write}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use lz4::liblz4::BlockChecksum; use lz4::{BlockSize, ContentChecksum}; -use fsst_rs::{train, SymbolTable}; +use fsst_rs::{train, Code}; const CORPUS: &str = include_str!("dracula.txt"); const TEST: &str = "I found my smattering of German very useful here"; @@ -22,7 +28,7 @@ fn bench_fsst(c: &mut Criterion) { let compressed = table.compress(plaintext); let escape_count = compressed .iter() - .filter(|b| **b == SymbolTable::ESCAPE) + .filter(|b| **b == Code::ESCAPE_CODE) .count(); let ratio = (plaintext.len() as f64) / (compressed.len() as f64); println!( diff --git a/src/builder.rs b/src/builder.rs index c7ae814..1dca853 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -21,8 +21,8 @@ struct Counter { impl Counter { fn new() -> Self { Self { - counts1: vec![0; 512], - counts2: vec![vec![0; 512]; 512], + counts1: vec![0; 511], + counts2: vec![vec![0; 511]; 511], } } @@ -47,8 +47,20 @@ impl Counter { } } +/// The number of generations used for training. This is taken from the [FSST paper]. +/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf pub const MAX_GENERATIONS: usize = 5; +/// Build and train a `SymbolTable` from a sample corpus of text. +/// +/// This function implements the generational algorithm described in the [FSST paper] Section +/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts +/// to merge symbols when doing so would yield better compression than leaving them unmerged. The +/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape +/// code). +/// +/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { let mut table = SymbolTable::default(); // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. 
@@ -87,7 +99,7 @@ impl SymbolTable { fn optimize(&self, counters: Counter) -> Self { let mut res = SymbolTable::default(); let mut pqueue = BinaryHeap::new(); - for code1 in 0..512 { + for code1 in 0..511 { let code1 = Code::from_u16(code1); let symbol1 = self.symbols[code1.0 as usize]; let gain = counters.count1(code1) * symbol1.len(); @@ -96,7 +108,7 @@ impl SymbolTable { gain, }); - for code2 in 0..512 { + for code2 in 0..511 { let code2 = Code::from_u16(code2); let symbol2 = &self.symbols[code2.0 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of @@ -130,6 +142,9 @@ impl SymbolTable { } } +/// A candidate for inclusion in a symbol table. +/// +/// This is really only useful for the `optimize` step of training. struct Candidate { gain: usize, symbol: Symbol, @@ -166,7 +181,7 @@ impl Ord for Candidate { #[cfg(test)] mod test { - use crate::{train, SymbolTable}; + use crate::{train, Code}; #[test] fn test_builder() { @@ -178,24 +193,24 @@ mod test { let compressed = table.compress(text.as_bytes()); // Ensure that the compressed string has no escape bytes - assert!(compressed.iter().all(|b| *b != SymbolTable::ESCAPE)); + assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE)); // Ensure that we can compress a string with no values seen at training time. let compressed = table.compress("xyz123".as_bytes()); assert_eq!( compressed, vec![ - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'x', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'y', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'z', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'1', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'2', - SymbolTable::ESCAPE, + Code::ESCAPE_CODE, b'3', ] ) diff --git a/src/lib.rs b/src/lib.rs index 7fff00d..67f1d3d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,13 @@ +//! A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. +//! +//! 
FSST is a string compression algorithm meant for use in database systems. It was designed by +//! [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression +//! and decompression of strings at compression rates competitive with or better than LZ4. +//! +//! NOTE: This current implementation is still in-progress, please use at your own risk. +//! +//! [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + use std::fmt::{Debug, Formatter}; pub use builder::*; @@ -5,8 +15,6 @@ pub use builder::*; mod builder; mod longest; -pub const ESCAPE: u8 = 0xFF; - /// A Symbol wraps a set of values of #[derive(Copy, Clone)] pub union Symbol { @@ -21,8 +29,10 @@ impl Debug for Symbol { } impl Symbol { + /// Zero value for `Symbol`. pub const ZERO: Self = Self::zero(); + /// Constructor for a `Symbol` from an 8-element byte slice. pub fn from_slice(slice: &[u8; 8]) -> Self { Self { bytes: *slice } } @@ -53,43 +63,26 @@ impl Symbol { size_of::() - null_bytes } + /// Returns true if the symbol does not encode any bytes. + /// + /// Note that this should only be true for the zero code. pub fn is_empty(&self) -> bool { self.len() == 0 } + /// Create a ew pub fn as_slice(&self) -> &[u8] { let len = self.len(); - // Safety: the length from `len()` can never be more than 8. + // SAFETY: constructors will not allow building a struct where len > 8. 
unsafe { &self.bytes[0..len] } } - pub fn append_to(&self, vec: &mut Vec) { - match self.len() { - 0 => self.append_inner::<0>(vec), - 1 => self.append_inner::<1>(vec), - 2 => self.append_inner::<2>(vec), - 3 => self.append_inner::<3>(vec), - 4 => self.append_inner::<4>(vec), - 5 => self.append_inner::<5>(vec), - 6 => self.append_inner::<6>(vec), - 7 => self.append_inner::<7>(vec), - 8 => self.append_inner::<8>(vec), - _ => unreachable!("Symbol::len() always ≤ 8"), - } - } - - fn append_inner(&self, vec: &mut Vec) { - for i in 0..N { - let byte: u8 = unsafe { self.num >> i } as u8; - vec.push(byte); - } - } - /// Returns true if the symbol is a prefix of the provided text. pub fn is_prefix(&self, text: &[u8]) -> bool { text.starts_with(self.as_slice()) } + /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. pub fn concat(&self, other: &Self) -> Self { let new_len = self.len() + other.len(); assert!(new_len <= 8, "cannot build symbol with length > 8"); @@ -102,13 +95,31 @@ impl Symbol { } } -/// Codes correspond to bytes. +/// Codes used to map symbols to bytes. +/// +/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit +/// value packed into a `u16`. +/// +/// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes +/// 0 through 255. `Code(256)` through `Code(511)` represent the actual codes -255. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct Code(u16); impl Code { + /// Maximum code value for the in-memory `Code` representation. pub const CODE_MAX: u16 = 512; + /// Maximum code value. Code 255 is reserved as the [escape code][`Self::ESCAPE_CODE`]. + pub const MAX_CODE: u8 = 254; + + /// Code used to indicate bytes that are not in the symbol table. + /// + /// When compressing a string that cannot fully be expressed with the symbol table, the compressed + /// output will contain an `ESCAPE` byte followed by a raw byte. 
At decompression time, the presence + /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of + /// being looked up in the symbol table. + pub const ESCAPE_CODE: u8 = 255; + /// Create a new code representing an escape byte. pub fn new_escaped(byte: u8) -> Self { Self(byte as u16) @@ -116,6 +127,12 @@ impl Code { /// Create a new code representing a symbol. pub fn new_symbol(code: u8) -> Self { + assert_ne!( + code, + Code::ESCAPE_CODE, + "code {code} cannot be used for symbol, reserved for ESCAPE" + ); + Self((code as u16) + 256) } @@ -136,10 +153,27 @@ impl Code { } } +/// The static symbol table used for compression and decompression. +/// +/// The `SymbolTable` is the central component of FSST. You can create a SymbolTable either by +/// default, or by [training] it on an input corpus of text. +/// +/// Example usage: +/// +/// ``` +/// use fsst_rs::{Symbol, SymbolTable}; +/// let mut table = SymbolTable::default(); +/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0])); +/// +/// let compressed = table.compress("hello".as_bytes()); +/// assert_eq!(compressed, vec![0u8]); +/// ``` +/// +/// training: [`train`] #[derive(Clone, Debug)] pub struct SymbolTable { /// Table mapping codes to symbols. - pub(crate) symbols: [Symbol; 512], + pub(crate) symbols: [Symbol; 511], /// Indicates the number of entries in the symbol table that have been populated. /// @@ -151,7 +185,7 @@ pub struct SymbolTable { impl Default for SymbolTable { fn default() -> Self { let mut table = Self { - symbols: [Symbol::ZERO; 512], + symbols: [Symbol::ZERO; 511], n_symbols: 0, }; @@ -170,14 +204,15 @@ impl Default for SymbolTable { /// The symbol table is trained on a corpus of data in the form of a single byte array, building up /// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". impl SymbolTable { - pub const ESCAPE: u8 = 255; - /// Insert a new symbol at the end of the table. 
/// /// # Panics /// Panics if the table is already full. pub fn insert(&mut self, symbol: Symbol) { - assert!(self.n_symbols < 512, "cannot insert into full symbol table"); + assert!( + self.n_symbols < self.symbols.len(), + "cannot insert into full symbol table" + ); self.symbols[self.n_symbols] = symbol; self.n_symbols += 1; } @@ -193,7 +228,7 @@ impl SymbolTable { if next_code.is_escape() { // Case 1 -escape: push an ESCAPE followed by the next byte. // println!("ESCAPE"); - values.push(Self::ESCAPE); + values.push(Code::ESCAPE_CODE); values.push(next_code.0 as u8); pos += 1; } else { @@ -218,7 +253,7 @@ impl SymbolTable { while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { let code = compressed[in_pos]; - if code == SymbolTable::ESCAPE { + if code == Code::ESCAPE_CODE { // Advance by one, do raw write. in_pos += 1; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer diff --git a/src/longest.rs b/src/longest.rs index 50f0ff7..445a88a 100644 --- a/src/longest.rs +++ b/src/longest.rs @@ -11,10 +11,9 @@ impl SymbolTable { // Find the code that best maps to the provided text table here. 
let mut best_code = Code::new_escaped(text[0]); let mut best_overlap = 1; - for code in 0..512 { + for code in 0..511 { let symbol = &self.symbols[code as usize]; if symbol.is_prefix(text) && symbol.len() > best_overlap { - // println!("using ideal code: code={code} symbol{:?} len={}", symbol.as_slice(), symbol.len()); best_code = Code::from_u16(code); best_overlap = symbol.len(); } From 6acf4aaf3acc90af48829a0d886586017bac74da Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 17:52:23 -0400 Subject: [PATCH 5/7] disable release action for now --- .github/workflows/release-plz.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml index 5d30bcf..4e6f481 100644 --- a/.github/workflows/release-plz.yml +++ b/.github/workflows/release-plz.yml @@ -4,10 +4,11 @@ permissions: pull-requests: write contents: write -on: - push: - branches: - - develop +# TODO(aduffy): uncomment when we're ready to publish +on: {} + # push: + # branches: + # - develop jobs: release-plz: From 5ff6cc62a1ccf4a2120558fc5fdd84026e018482 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 17:53:54 -0400 Subject: [PATCH 6/7] README --- README.md | 13 ++++++++++++- src/lib.rs | 11 +---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f0911f4..5f16b80 100644 --- a/README.md +++ b/README.md @@ -1 +1,12 @@ -# fsst \ No newline at end of file +# fsst-rs + +A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. + +FSST is a string compression algorithm meant for use in database systems. It was designed by +[Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression +and decompression of strings at compression rates competitive with or better than LZ4. + +**NOTE**: This current implementation is still in-progress, please use at your own risk. 
+ + +[whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf diff --git a/src/lib.rs b/src/lib.rs index 67f1d3d..a545ac1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,4 @@ -//! A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. -//! -//! FSST is a string compression algorithm meant for use in database systems. It was designed by -//! [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression -//! and decompression of strings at compression rates competitive with or better than LZ4. -//! -//! NOTE: This current implementation is still in-progress, please use at your own risk. -//! -//! [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf - +#![doc = include_str!("../README.md")] use std::fmt::{Debug, Formatter}; pub use builder::*; From fa2a1b86fadad44bf81931f3bf5b71c4b817d5a2 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 12 Aug 2024 17:57:03 -0400 Subject: [PATCH 7/7] words --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f16b80..d6957db 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,11 @@ FSST is a string compression algorithm meant for use in database systems. It was [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression and decompression of strings at compression rates competitive with or better than LZ4. -**NOTE**: This current implementation is still in-progress, please use at your own risk. +This implementation is somewhat inspired by the [MIT-licensed implementation] from the paper authors, written in C++, +but it is mostly written from a careful reading of the paper. + +**NOTE: This current implementation is still in-progress and is not production ready, please use at your own risk.** [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf +[MIT-licensed implementation]: https://github.com/cwida/fsst