diff --git a/.gitignore b/.gitignore index 4b5d46c..b0e73dd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ *.orig *.swp tmp/ +lib/ diff --git a/Cargo.lock b/Cargo.lock index f588a86..3127194 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,6 +49,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" + [[package]] name = "allocator-api2" version = "0.2.18" @@ -134,6 +140,23 @@ dependencies = [ "derive_arbitrary", ] +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + [[package]] name = "async-compression" version = "0.4.12" @@ -203,6 +226,29 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "av1-grain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6678909d8c5d46a42abcf571271e15fdbc0a225e3646cf23762cd415046c78bf" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "876c75a42f6364451a033496a14c44bffe41f5f4a8236f697391f11024e596d2" +dependencies = [ + "arrayvec", +] + [[package]] name = "aws-config" version = "1.5.4" @@ -696,12 +742,53 @@ version = "1.6.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bindgen" +version = "0.69.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +dependencies = [ + "bitflags 2.6.0", + "cexpr", + "clang-sys", + "itertools 0.12.1", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.72", + "which", +] + +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "bitstream-io" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dcde5f311c85b8ca30c2e4198d4326bc342c76541590106f5fa4a50946ea499" + [[package]] name = "block-buffer" version = "0.10.4" @@ -721,18 +808,36 @@ dependencies = [ "serde", ] +[[package]] +name = "built" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "236e6289eda5a812bc6b53c3b024039382a2895fbbeef2d748b2931546d392c4" + [[package]] name = "bumpalo" version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytemuck" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.7.1" @@ -786,6 +891,25 @@ dependencies = [ "libc", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-expr" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02" +dependencies = [ + "smallvec", + "target-lexicon", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -817,6 +941,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.13" @@ -857,6 +992,12 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colorchoice" version = "1.0.2" @@ -876,6 +1017,26 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "console_log" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" +dependencies = [ + "log", + "web-sys", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -946,12 +1107,37 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1160,12 +1346,37 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "exr" +version = "1.72.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "887d93f60543e9a9362ef8a21beedd0a833c5d9610e18c67abe15a5963dcb1a4" +dependencies = [ + "bit_field", + "flume", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + [[package]] name = "fastrand" version = "2.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "fdeflate" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f9bfee30e4dedf0ab8b422f03af778d9612b63f502710fc500a334ebe2de645" +dependencies = [ + "simd-adler32", +] + [[package]] name = "ff" version = "0.12.1" @@ -1186,6 +1397,15 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "spin", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1342,12 +1562,28 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gif" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "gimli" version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "globset" version = "0.4.14" @@ -1410,6 +1646,16 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1453,6 +1699,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "http" version = "0.2.12" @@ -1673,6 +1928,45 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "image" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99314c8a2152b8ddb211f924cdae532d8c5e4c8bb54728e12fff1b0cd5963a10" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f79afb8cbee2ef20f59ccd477a218c12a93943d075b492015ecb1bb81f8ee904" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44feda355f4159a7c757171a77de25daf6411e217b4cabd03bd6650690468126" + [[package]] name = "indexmap" version = "1.9.3" @@ -1726,6 +2020,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -1738,6 +2043,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -1762,6 +2076,12 @@ 
dependencies = [ "libc", ] +[[package]] +name = "jpeg-decoder" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" + [[package]] name = "js-sys" version = "0.3.69" @@ -1792,18 +2112,61 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lebe" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" + [[package]] name = "libc" version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "libloading" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +dependencies = [ + "cfg-if", + "windows-targets 0.52.6", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + 
"scopeguard", +] + [[package]] name = "lockfree-object-pool" version = "0.1.6" @@ -1816,6 +2179,15 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + [[package]] name = "lru" version = "0.12.4" @@ -1850,6 +2222,21 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "maybe-owned" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", +] + [[package]] name = "md-5" version = "0.10.6" @@ -1882,6 +2269,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.4" @@ -1889,6 +2282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", + "simd-adler32", ] [[package]] @@ -1903,6 +2297,28 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1929,6 +2345,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1938,6 +2365,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1997,6 +2435,12 @@ dependencies = [ "sha2", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "pbkdf2" version = "0.12.2" @@ -2007,6 +2451,33 @@ dependencies = [ "hmac", ] +[[package]] +name = "pdfium-render" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf21aa9bd11aa175e8755e0dbc613affe885e149c4b3ee4ac6d2c183260e727" 
+dependencies = [ + "bindgen", + "bitflags 2.6.0", + "bytemuck", + "bytes", + "chrono", + "console_error_panic_hook", + "console_log", + "image", + "itertools 0.13.0", + "js-sys", + "libloading", + "log", + "maybe-owned", + "once_cell", + "utf16string", + "vecmath", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "pem" version = "3.0.4" @@ -2075,6 +2546,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piston-float" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" + [[package]] name = "pkcs8" version = "0.9.0" @@ -2091,6 +2568,19 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "png" +version = "0.17.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06e4b0d3d1312775e782c86c91a111aa1f910cbb65e1337f9975b5f9a554b5e1" +dependencies = [ + "bitflags 1.3.2", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "1.7.0" @@ -2112,6 +2602,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" +dependencies = [ + "proc-macro2", + "syn 2.0.72", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -2121,6 +2621,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "profiling" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d84d1d7a6ac92673717f9f6d1518374ef257669c24ebc5ac25d5033828be58" +dependencies = [ + 
"profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd" +dependencies = [ + "quote", + "syn 2.0.72", +] + [[package]] name = "prost" version = "0.13.1" @@ -2138,7 +2657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" dependencies = [ "anyhow", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.72", @@ -2153,6 +2672,21 @@ dependencies = [ "prost", ] +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quinn" version = "0.11.3" @@ -2163,7 +2697,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.0.0", "rustls 0.23.12", "socket2", "thiserror", @@ -2180,7 +2714,7 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash", + "rustc-hash 2.0.0", "rustls 0.23.12", "slab", "thiserror", @@ -2240,15 +2774,85 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rav1e" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9" +dependencies = [ + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools 0.12.1", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + 
"once_cell", + "paste", + "profiling", + "rand", + "rand_chacha", + "simd_helpers", + "system-deps", + "thiserror", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f0bfd976333248de2078d350bfdf182ff96e168a24d23d2436cef320dd4bdd" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rgb", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redacter" -version = "0.6.0" +version = "0.7.0" dependencies = [ "async-recursion", "async-trait", "aws-config", "aws-sdk-comprehend", "aws-sdk-s3", + "bytes", "cargo-husky", "chrono", "clap", @@ -2258,9 +2862,11 @@ dependencies = [ "gcloud-sdk", "globset", "hex", + "image", "indicatif", "mime", "mime_guess", + "pdfium-render", "rand", "reqwest", "rsb_derive", @@ -2386,6 +2992,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rgb" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f86ae463694029097b846d8f99fd5536740602ae00022c0c50c5600720b2f71" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.8" @@ -2418,6 +3033,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.0.0" @@ -2439,7 +3060,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -2585,6 +3206,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "sct" version = "0.7.1" @@ -2628,7 +3255,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -2683,6 +3310,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2756,6 +3392,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -2781,6 +3423,15 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -2823,6 +3474,9 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] [[package]] name = "spki" @@ -2883,6 +3537,25 @@ dependencies = [ "futures-core", ] +[[package]] +name = "system-deps" +version = "6.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349" +dependencies = [ + "cfg-expr", + "heck", + "pkg-config", + "toml", + "version-compare", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + [[package]] name = "tempfile" version = "3.11.0" @@ -2926,6 +3599,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tiff" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e" +dependencies = [ + "flate2", + "jpeg-decoder", + "weezl", +] + [[package]] name = "time" version = "0.3.36" @@ -3046,6 +3730,40 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.20" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583c44c02ad26b0c3f3066fe629275e50627026c51ac2e595cca4c230ce1ce1d" +dependencies = [ + "indexmap 2.3.0", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tonic" version = "0.12.1" @@ -3256,6 +3974,15 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" +dependencies = [ + "byteorder", +] + [[package]] name = "utf8parse" version = "0.2.2" @@ -3268,12 +3995,38 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +[[package]] +name = "v_frame" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f32aaa24bacd11e488aa9ba66369c7cd514885742c9fe08cfe85884db3e92b" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + [[package]] name = "valuable" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vecmath" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" +dependencies = [ + "piston-float", +] + +[[package]] +name = "version-compare" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" + [[package]] name = "version_check" version = "0.9.5" @@ -3399,6 +4152,24 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "weezl" +version = 
"0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" @@ -3569,6 +4340,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.52.0" @@ -3696,3 +4476,27 @@ dependencies = [ "cc", "pkg-config", ] + +[[package]] +name = "zune-core" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16099418600b4d8f028622f73ff6e3deaabdff330fb9a2a131dea781ee8b0768" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml index 006fb01..221d315 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "redacter" -version = "0.6.0" +version = "0.7.0" edition = "2021" authors = ["Abdulla Abdurakhmanov "] license = "Apache-2.0" @@ -56,6 +56,9 @@ reqwest = { version = "0.12", default-features = 
false, features = ["multipart", tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } rand = "0.8" +pdfium-render = { version = "0.8", features = ["thread_safe", "image"] } +image = "0.25" +bytes = { version = "1" } [dev-dependencies] diff --git a/README.md b/README.md index 8db0e30..d6ad478 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,14 @@ Google Cloud Platform's DLP API. * text, html, json files * structured data table files (csv) * images (jpeg, png, bpm, gif) + * PDF files (rendered as images) * [AWS Comprehend](https://aws.amazon.com/comprehend/) PII redaction: * text, html, csv, json files * [Microsoft Presidio](https://microsoft.github.io/presidio/) for PII redaction (open source project that you can install on-prem). * text, html, csv, json files * images + * PDF files (rendered as images) * [Gemini LLM](https://ai.google.dev/gemini-api/docs) based redaction * text, html, csv, json files * [Open AI LLM](https://openai.com/) based redaction @@ -151,6 +153,25 @@ Optionally, you can provide a model name using `--open-ai-model` option. Default You can specify multiple redacters using `--redact` option multiple times. The tool will apply redaction in the order of the redacters specified. +## PDF redaction + +PDF redaction is supported by rendering PDF files as images and redacting them. +To render and convert PDF files, the tool uses the external library `Pdfium` (the C++ PDF library used by the Google Chromium +project). +This library needs to be installed separately on your system. + +Installation instructions: + +- Download the latest release for your system, for example, + from [Pdfium releases](https://github.com/bblanchon/pdfium-binaries/releases).
+- Extract the archive and copy the library file `libpdfium.so` to one of the following directories: + - The directory where the redacter tool is installed (such as `/usr/local/bin`) + - The `lib` directory next to the tool's install directory (`/usr/local/lib/` if you have installed the tool in `/usr/local/bin`) + - A directory listed in the `LD_LIBRARY_PATH` environment variable + +If the library is detected correctly, it is reported in the tool output as: +> PDF to image support: ✓ Yes + ## Examples: ```sh diff --git a/media/redacter-demo.gif b/media/redacter-demo.gif index 48dc627..7a789b2 100644 Binary files a/media/redacter-demo.gif and b/media/redacter-demo.gif differ diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000..39a2b6e --- /dev/null +++ b/renovate.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:base" + ] +} diff --git a/src/commands/copy_command.rs b/src/commands/copy_command.rs index 7ffeaea..fd8f8c3 100644 --- a/src/commands/copy_command.rs +++ b/src/commands/copy_command.rs @@ -1,4 +1,5 @@ use crate::errors::AppError; +use crate::file_converters::FileConverters; use crate::filesystems::{ DetectFileSystem, FileMatcher, FileMatcherResult, FileSystemConnection, FileSystemRef, }; @@ -59,16 +60,32 @@ pub async fn command_copy( } else { Style::new().dim().apply_to("-".to_string()) }; + + let mut file_converters = FileConverters::new(); + file_converters.init().await?; + + let converter_style = Style::new(); + let pdf_support_output = if file_converters.pdf_image_converter.is_some() { + converter_style + .clone() + .green() + .apply_to("✓ Yes".to_string()) + } else { + converter_style.clone().dim().apply_to("✗ No".to_string()) + }; + term.write_line( format!( - "Copying from {} to {}.\nRedacting: {}.\nSampling: {}\n", + "Copying from {} to {}.\nRedacting: {}.\nSampling: {}\nPDF to image support: {}\n", bold_style.clone().white().apply_to(source), 
bold_style.clone().yellow().apply_to(destination), redacted_output, - sampling_output + sampling_output, + pdf_support_output, ) .as_str(), )?; + let bar = ProgressBar::new(1); bar.set_style( ProgressStyle::with_template( @@ -130,6 +147,7 @@ pub async fn command_copy( &mut destination_fs, &options, &maybe_redacters, + &file_converters, ) .await? { @@ -151,6 +169,7 @@ pub async fn command_copy( &mut destination_fs, &options, &maybe_redacters, + &file_converters, ) .await? { @@ -176,6 +195,7 @@ enum TransferFileResult { Skipped, } +#[allow(clippy::too_many_arguments)] async fn transfer_and_redact_file< 'a, SFS: FileSystemConnection<'a>, @@ -188,6 +208,7 @@ async fn transfer_and_redact_file< destination_fs: &mut DFS, options: &CopyCommandOptions, redacter: &Option<(RedacterBaseOptions, Vec)>, + file_converters: &FileConverters, ) -> AppResult { let bold_style = Style::new().bold().white(); let (base_file_ref, source_reader) = source_fs.download(source_file_ref).await?; @@ -255,6 +276,7 @@ async fn transfer_and_redact_file< source_reader, file_ref, redacter_with_options, + file_converters, ) .await? 
} else { @@ -279,6 +301,7 @@ async fn redact_upload_file< source_reader: S, dest_file_ref: &FileSystemRef, redacter_with_options: &(RedacterBaseOptions, Vec), + file_converters: &FileConverters, ) -> AppResult { let (redacter_base_options, redacters) = redacter_with_options; let mut support_redacters = Vec::new(); @@ -294,6 +317,7 @@ async fn redact_upload_file< redacter_base_options, source_reader, dest_file_ref, + file_converters, bar, ) .await diff --git a/src/errors.rs b/src/errors.rs index 4ca5002..c28fef4 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -34,6 +34,10 @@ pub enum AppError { RedacterConfigError { message: String }, #[error("Template error: {0}")] TemplateError(#[from] TemplateError), + #[error("PDF conversion error: {0}")] + PdfiumError(#[from] pdfium_render::prelude::PdfiumError), + #[error("Image conversion error: {0}")] + ImageError(#[from] image::ImageError), #[error("System error: {message}")] SystemError { message: String }, } diff --git a/src/file_converters/mod.rs b/src/file_converters/mod.rs new file mode 100644 index 0000000..c8f7669 --- /dev/null +++ b/src/file_converters/mod.rs @@ -0,0 +1,26 @@ +use crate::file_converters::pdf::PdfToImage; +use crate::AppResult; + +pub mod pdf; + +pub struct FileConverters { + pub pdf_image_converter: Option>, +} + +impl FileConverters { + pub fn new() -> Self { + Self { + pdf_image_converter: None, + } + } + + pub async fn init(&mut self) -> AppResult<()> { + match pdf::PdfImageConverter::new().ok() { + Some(pdf_image_converter) => { + self.pdf_image_converter = Some(Box::new(pdf_image_converter)); + Ok(()) + } + None => Ok(()), + } + } +} diff --git a/src/file_converters/pdf.rs b/src/file_converters/pdf.rs new file mode 100644 index 0000000..199d2fe --- /dev/null +++ b/src/file_converters/pdf.rs @@ -0,0 +1,104 @@ +use crate::errors::AppError; +use crate::AppResult; +use gcloud_sdk::prost::bytes; +use gcloud_sdk::prost::bytes::Bytes; +use pdfium_render::prelude::*; + +#[derive(Debug, Clone)] +pub 
struct PdfInfo { + pub pages: Vec, +} + +#[derive(Debug, Clone)] +pub struct PdfPageInfo { + pub height: PdfPoints, + pub width: PdfPoints, + pub page_as_images: image::DynamicImage, +} + +pub trait PdfToImage { + fn convert_to_images(&self, pdf_bytes: bytes::Bytes) -> AppResult; + + fn images_to_pdf(&self, pdf_info: PdfInfo) -> AppResult; +} + +pub struct PdfImageConverter { + pdfium: Pdfium, +} + +impl PdfImageConverter { + pub fn new() -> AppResult { + let executable = std::env::current_exe()?; + let current_dir = executable + .parent() + .ok_or(AppError::SystemError { + message: "No parent directory for executable".to_string(), + })? + .to_path_buf(); + + let bindings = Pdfium::bind_to_library( + // Attempt to bind to a pdfium library in the current working directory... + Pdfium::pdfium_platform_library_name_at_path("./"), + ) + .or_else(|_| Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./lib"))) + .or_else(|_| { + Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path( + &current_dir + .parent() + .map(|p| p.join("lib")) + .unwrap_or(current_dir.clone()), + )) + }) + .or_else(|_| { + Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(&current_dir)) + }) + .or_else(|_| Pdfium::bind_to_system_library())?; + + let pdfium = Pdfium::new(bindings); + Ok(Self { pdfium }) + } +} + +impl PdfToImage for PdfImageConverter { + fn convert_to_images(&self, pdf_bytes: Bytes) -> AppResult { + let render_config = PdfRenderConfig::new() + .set_target_width(2000) + .set_maximum_height(2000) + .rotate_if_landscape(PdfPageRenderRotation::Degrees90, true); + let document = self + .pdfium + .load_pdf_from_byte_vec(pdf_bytes.to_vec(), None)?; + let mut pdf_info = PdfInfo { pages: Vec::new() }; + for page in document.pages().iter() { + let image = page.render_with_config(&render_config)?.as_image(); + let page_info = PdfPageInfo { + height: page.height(), + width: page.width(), + page_as_images: image, + }; + pdf_info.pages.push(page_info); 
+ } + Ok(pdf_info) + } + + fn images_to_pdf(&self, pdf_info: PdfInfo) -> AppResult { + let mut document = self.pdfium.create_new_pdf()?; + for src_page in pdf_info.pages { + let mut page = + document + .pages_mut() + .create_page_at_start(PdfPagePaperSize::from_points( + src_page.width, + src_page.height, + ))?; + let object = PdfPageImageObject::new_with_size( + &document, + &src_page.page_as_images, + src_page.width, + src_page.height, + )?; + page.objects_mut().add_image_object(object)?; + } + Ok(document.save_to_bytes()?.into()) + } +} diff --git a/src/main.rs b/src/main.rs index 54c8761..10d70ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,9 @@ use std::error::Error; +use args::*; use clap::Parser; use console::{Style, Term}; -use args::*; - use crate::commands::*; use crate::errors::AppError; @@ -23,6 +22,8 @@ pub type AppResult = Result; mod common_types; +mod file_converters; + pub fn config_env_var(name: &str) -> Result { std::env::var(name).map_err(|e| format!("{}: {}", name, e)) } diff --git a/src/redacters/aws_comprehend.rs b/src/redacters/aws_comprehend.rs index dd5bf73..c4334a7 100644 --- a/src/redacters/aws_comprehend.rs +++ b/src/redacters/aws_comprehend.rs @@ -80,11 +80,11 @@ impl<'a> Redacter for AwsComprehendRedacter<'a> { async fn redact(&self, input: RedacterDataItem) -> AppResult { match &input.content { RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, - RedacterDataItemContent::Table { .. } | RedacterDataItemContent::Image { .. } => { - Err(AppError::SystemError { - message: "Attempt to redact of unsupported type".to_string(), - }) - } + RedacterDataItemContent::Table { .. } + | RedacterDataItemContent::Image { .. } + | RedacterDataItemContent::Pdf { .. 
} => Err(AppError::SystemError { + message: "Attempt to redact of unsupported type".to_string(), + }), } } diff --git a/src/redacters/gcp_dlp.rs b/src/redacters/gcp_dlp.rs index 775ed86..f6e17f2 100644 --- a/src/redacters/gcp_dlp.rs +++ b/src/redacters/gcp_dlp.rs @@ -12,6 +12,7 @@ use gcloud_sdk::tonic::metadata::MetadataValue; use gcloud_sdk::{tonic, GoogleApi, GoogleAuthMiddleware}; use mime::Mime; use rvstruct::ValueStruct; +use tokio_util::bytes; #[derive(Clone)] pub struct GcpDlpRedacter<'a> { @@ -97,31 +98,38 @@ impl<'a> GcpDlpRedacter<'a> { } } + async fn redact_image_content( + &self, + input_bytes_content: gcloud_sdk::google::privacy::dlp::v2::ByteContentItem, + ) -> AppResult { + let mut request = + tonic::Request::new(gcloud_sdk::google::privacy::dlp::v2::RedactImageRequest { + parent: format!( + "projects/{}/locations/global", + self.gcp_dlp_options.project_id.value() + ), + inspect_config: Some(Self::create_inspect_config()), + byte_item: Some(input_bytes_content), + ..gcloud_sdk::google::privacy::dlp::v2::RedactImageRequest::default() + }); + request.metadata_mut().insert( + "x-goog-user-project", + MetadataValue::::try_from( + self.gcp_dlp_options.project_id.value(), + )?, + ); + let response = self.client.get().redact_image(request).await?; + Ok(response.into_inner().redacted_image.into()) + } + pub async fn redact_image_file(&self, input: RedacterDataItem) -> AppResult { match &input.content { RedacterDataItemContent::Image { mime_type, data: _ } => { let output_mime = mime_type.clone(); - let mut request = - tonic::Request::new(gcloud_sdk::google::privacy::dlp::v2::RedactImageRequest { - parent: format!( - "projects/{}/locations/global", - self.gcp_dlp_options.project_id.value() - ), - inspect_config: Some(Self::create_inspect_config()), - byte_item: Some(input.content.try_into()?), - ..gcloud_sdk::google::privacy::dlp::v2::RedactImageRequest::default() - }); - request.metadata_mut().insert( - "x-goog-user-project", - MetadataValue::::try_from( - 
self.gcp_dlp_options.project_id.value(), - )?, - ); - let response = self.client.get().redact_image(request).await?; let content = RedacterDataItemContent::Image { mime_type: output_mime, - data: response.into_inner().redacted_image.into(), + data: self.redact_image_content(input.content.try_into()?).await?, }; Ok(RedacterDataItem { file_ref: input.file_ref, @@ -197,9 +205,11 @@ impl<'a> Redacter for GcpDlpRedacter<'a> { { self.redact_image_file(input).await } - RedacterDataItemContent::Image { .. } => Err(AppError::SystemError { - message: "Attempt to redact of unsupported image type".to_string(), - }), + RedacterDataItemContent::Image { .. } | RedacterDataItemContent::Pdf { .. } => { + Err(AppError::SystemError { + message: "Attempt to redact of unsupported type".to_string(), + }) + } } } @@ -207,17 +217,21 @@ impl<'a> Redacter for GcpDlpRedacter<'a> { &self, file_ref: &FileSystemRef, ) -> AppResult { - Ok( - if file_ref.media_type.as_ref().iter().all(|media_type| { - Redacters::is_mime_text(media_type) - || Redacters::is_mime_table(media_type) - || Self::check_supported_image_type(media_type) - }) { + Ok(match file_ref.media_type.as_ref() { + Some(media_type) if Redacters::is_mime_text(media_type) => { RedactSupportedOptions::Supported - } else { - RedactSupportedOptions::Unsupported - }, - ) + } + Some(media_type) if Redacters::is_mime_table(media_type) => { + RedactSupportedOptions::Supported + } + Some(media_type) if Self::check_supported_image_type(media_type) => { + RedactSupportedOptions::Supported + } + Some(media_type) if Redacters::is_mime_pdf(media_type) => { + RedactSupportedOptions::SupportedAsImages + } + _ => RedactSupportedOptions::Unsupported, + }) } fn redacter_type(&self) -> RedacterType { @@ -277,9 +291,11 @@ impl TryInto for RedacterData ), }) } - RedacterDataItemContent::Image { .. } => Err(AppError::SystemError { - message: "Attempt to convert image content to ContentItem".to_string(), - }), + RedacterDataItemContent::Image { .. 
} | RedacterDataItemContent::Pdf { .. } => { + Err(AppError::SystemError { + message: "Attempt to convert image content to ContentItem".to_string(), + }) + } } } } diff --git a/src/redacters/gemini_llm.rs b/src/redacters/gemini_llm.rs index 42f39ed..6044bfa 100644 --- a/src/redacters/gemini_llm.rs +++ b/src/redacters/gemini_llm.rs @@ -167,11 +167,11 @@ impl<'a> Redacter for GeminiLlmRedacter<'a> { async fn redact(&self, input: RedacterDataItem) -> AppResult { match &input.content { RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, - RedacterDataItemContent::Table { .. } | RedacterDataItemContent::Image { .. } => { - Err(AppError::SystemError { - message: "Attempt to redact of unsupported type".to_string(), - }) - } + RedacterDataItemContent::Table { .. } + | RedacterDataItemContent::Image { .. } + | RedacterDataItemContent::Pdf { .. } => Err(AppError::SystemError { + message: "Attempt to redact of unsupported type".to_string(), + }), } } diff --git a/src/redacters/mod.rs b/src/redacters/mod.rs index 81cd99f..d928d91 100644 --- a/src/redacters/mod.rs +++ b/src/redacters/mod.rs @@ -4,6 +4,7 @@ use crate::reporter::AppReporter; use crate::AppResult; use futures::{Stream, TryStreamExt}; use gcloud_sdk::prost::bytes; +use image::ImageFormat; use indicatif::ProgressBar; use mime::Mime; use std::fmt::Display; @@ -22,6 +23,8 @@ pub use gemini_llm::*; mod open_ai_llm; use crate::args::RedacterType; +use crate::file_converters::pdf::{PdfInfo, PdfPageInfo, PdfToImage}; +use crate::file_converters::FileConverters; pub use open_ai_llm::*; #[derive(Debug, Clone)] @@ -41,6 +44,9 @@ pub enum RedacterDataItemContent { mime_type: Mime, data: bytes::Bytes, }, + Pdf { + data: bytes::Bytes, + }, } #[derive(Clone)] @@ -140,12 +146,17 @@ impl<'a> Redacters<'a> { pub fn is_mime_image(mime: &Mime) -> bool { mime.type_() == mime::IMAGE } + + pub fn is_mime_pdf(mime: &Mime) -> bool { + *mime == mime::APPLICATION_PDF + } } #[derive(Debug, Clone, PartialEq, Eq)] pub enum 
RedactSupportedOptions { Supported, SupportedAsText, + SupportedAsImages, Unsupported, } @@ -202,15 +213,90 @@ pub async fn redact_stream< redacter_base_options: &RedacterBaseOptions, input: S, file_ref: &FileSystemRef, + file_converters: &FileConverters, bar: &ProgressBar, ) -> AppResult> + Send + Sync + Unpin + 'static>> { let mut redacters_supported_options = Vec::with_capacity(redacters.len()); for redacter in redacters { let supported_options = redacter.redact_supported_options(file_ref).await?; - redacters_supported_options.push((redacter, supported_options)); + redacters_supported_options.push((*redacter, supported_options)); + } + + let mut redacted = stream_to_redact_item( + redacter_base_options, + input, + file_ref, + &redacters_supported_options, + ) + .await?; + + for (index, (redacter, options)) in redacters_supported_options.iter().enumerate() { + let width = " ".repeat(index); + match options { + RedactSupportedOptions::Supported => { + bar.println(format!( + "{width}↳ Redacting using {} redacter", + redacter.redacter_type() + )); + redacted = redacter.redact(redacted).await?; + } + RedactSupportedOptions::SupportedAsImages => { + match file_converters.pdf_image_converter { + Some(ref converter) => { + redacted = redact_pdf_with_images_converter( + file_ref, + bar, + redacted, + *redacter, + &width, + converter.as_ref(), + ) + .await? + } + None => { + bar.println(format!( + "{width}↲ Skipping redaction because PDF to image converter is not available", + )); + } + } + } + RedactSupportedOptions::SupportedAsText | RedactSupportedOptions::Unsupported => {} + } } - let mut item_to_redact = match file_ref.media_type { + match redacted.content { + RedacterDataItemContent::Value(content) => { + let bytes = bytes::Bytes::from(content.into_bytes()); + Ok(Box::new(futures::stream::iter(vec![Ok(bytes)]))) + } + RedacterDataItemContent::Image { data, .. 
} => { + Ok(Box::new(futures::stream::iter(vec![Ok(data)]))) + } + RedacterDataItemContent::Pdf { data } => { + Ok(Box::new(futures::stream::iter(vec![Ok(data)]))) + } + RedacterDataItemContent::Table { headers, rows } => { + let mut writer = csv_async::AsyncWriter::from_writer(vec![]); + writer.write_record(headers).await?; + for row in rows { + writer.write_record(row).await?; + } + writer.flush().await?; + let bytes = bytes::Bytes::from(writer.into_inner().await?); + Ok(Box::new(futures::stream::iter(vec![Ok(bytes)]))) + } + } +} + +async fn stream_to_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, +>( + redacter_base_options: &RedacterBaseOptions, + input: S, + file_ref: &FileSystemRef, + redacters_supported_options: &[(&impl Redacter, RedactSupportedOptions)], +) -> AppResult { + match file_ref.media_type { Some(ref mime) if Redacters::is_mime_text(mime) || (Redacters::is_mime_table(mime) @@ -221,72 +307,16 @@ pub async fn redact_stream< .iter() .all(|(_, o)| matches!(o, RedactSupportedOptions::Supported))) => { - let all_chunks: Vec = input.try_collect().await?; - let all_bytes = all_chunks.concat(); - let whole_content = - String::from_utf8(all_bytes).map_err(|e| AppError::SystemError { - message: format!("Failed to convert bytes to string: {}", e), - })?; - let content = if let Some(sampling_size) = redacter_base_options.sampling_size { - let sampling_size = std::cmp::min(sampling_size, whole_content.len()); - whole_content - .chars() - .take(sampling_size) - .collect::() - } else { - whole_content - }; - Ok(RedacterDataItem { - content: RedacterDataItemContent::Value(content), - file_ref: file_ref.clone(), - }) + stream_to_text_redact_item(redacter_base_options, input, file_ref).await } Some(ref mime) if Redacters::is_mime_image(mime) => { - let all_chunks: Vec = input.try_collect().await?; - let all_bytes = all_chunks.concat(); - Ok(RedacterDataItem { - content: RedacterDataItemContent::Image { - mime_type: mime.clone(), - data: 
all_bytes.into(), - }, - file_ref: file_ref.clone(), - }) + stream_to_image_redact_item(input, file_ref, mime.clone()).await } Some(ref mime) if Redacters::is_mime_table(mime) => { - let reader = tokio_util::io::StreamReader::new( - input.map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err)), - ); - let mut reader = csv_async::AsyncReaderBuilder::default() - .has_headers(!redacter_base_options.csv_headers_disable) - .delimiter( - redacter_base_options - .csv_delimiter - .as_ref() - .cloned() - .unwrap_or(b','), - ) - .create_reader(reader); - let headers = if !redacter_base_options.csv_headers_disable { - reader - .headers() - .await? - .into_iter() - .map(|h| h.to_string()) - .collect() - } else { - vec![] - }; - let records: Vec = reader.records().try_collect().await?; - Ok(RedacterDataItem { - content: RedacterDataItemContent::Table { - headers, - rows: records - .iter() - .map(|r| r.iter().map(|c| c.to_string()).collect()) - .collect(), - }, - file_ref: file_ref.clone(), - }) + stream_to_table_redact_item(redacter_base_options, input, file_ref).await + } + Some(ref mime) if Redacters::is_mime_pdf(mime) => { + stream_to_pdf_redact_item(input, file_ref).await } Some(ref mime) => Err(AppError::SystemError { message: format!("Media type {} is not supported for redaction", mime), @@ -294,36 +324,166 @@ pub async fn redact_stream< None => Err(AppError::SystemError { message: "Media type is not provided to redact".to_string(), }), - }?; + } +} - for (index, (redacter, options)) in redacters_supported_options.iter().enumerate() { - if !matches!(options, RedactSupportedOptions::Unsupported) { - let width = " ".repeat(index); +async fn stream_to_text_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, +>( + redacter_base_options: &RedacterBaseOptions, + input: S, + file_ref: &FileSystemRef, +) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + let whole_content = 
String::from_utf8(all_bytes).map_err(|e| AppError::SystemError { + message: format!("Failed to convert bytes to string: {}", e), + })?; + let content = if let Some(sampling_size) = redacter_base_options.sampling_size { + let sampling_size = std::cmp::min(sampling_size, whole_content.len()); + whole_content + .chars() + .take(sampling_size) + .collect::() + } else { + whole_content + }; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Value(content), + file_ref: file_ref.clone(), + }) +} + +async fn stream_to_table_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, +>( + redacter_base_options: &RedacterBaseOptions, + input: S, + file_ref: &FileSystemRef, +) -> AppResult { + let reader = tokio_util::io::StreamReader::new( + input.map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err)), + ); + let mut reader = csv_async::AsyncReaderBuilder::default() + .has_headers(!redacter_base_options.csv_headers_disable) + .delimiter( + redacter_base_options + .csv_delimiter + .as_ref() + .cloned() + .unwrap_or(b','), + ) + .create_reader(reader); + let headers = if !redacter_base_options.csv_headers_disable { + reader + .headers() + .await? 
+ .into_iter() + .map(|h| h.to_string()) + .collect() + } else { + vec![] + }; + let records: Vec = reader.records().try_collect().await?; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Table { + headers, + rows: records + .iter() + .map(|r| r.iter().map(|c| c.to_string()).collect()) + .collect(), + }, + file_ref: file_ref.clone(), + }) +} + +async fn stream_to_image_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, +>( + input: S, + file_ref: &FileSystemRef, + mime: Mime, +) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + Ok(RedacterDataItem { + content: RedacterDataItemContent::Image { + mime_type: mime.clone(), + data: all_bytes.into(), + }, + file_ref: file_ref.clone(), + }) +} + +async fn stream_to_pdf_redact_item< + S: Stream> + Send + Unpin + Sync + 'static, +>( + input: S, + file_ref: &FileSystemRef, +) -> AppResult { + let all_chunks: Vec = input.try_collect().await?; + let all_bytes = all_chunks.concat(); + Ok(RedacterDataItem { + content: RedacterDataItemContent::Pdf { + data: all_bytes.into(), + }, + file_ref: file_ref.clone(), + }) +} + +async fn redact_pdf_with_images_converter( + file_ref: &FileSystemRef, + bar: &ProgressBar, + redacted: RedacterDataItem, + redacter: &impl Redacter, + width: &String, + converter: &dyn PdfToImage, +) -> Result { + match redacted.content { + RedacterDataItemContent::Pdf { data } => { bar.println(format!( - "{width}↳ Redacting using {} redacter", + "{width}↳ Redacting using {} redacter and converting the PDF to images", redacter.redacter_type() )); - item_to_redact = redacter.redact(item_to_redact).await?; - } - } - - match item_to_redact.content { - RedacterDataItemContent::Value(content) => { - let bytes = bytes::Bytes::from(content.into_bytes()); - Ok(Box::new(futures::stream::iter(vec![Ok(bytes)]))) - } - RedacterDataItemContent::Image { data, .. 
} => { - Ok(Box::new(futures::stream::iter(vec![Ok(data)]))) - } - RedacterDataItemContent::Table { headers, rows } => { - let mut writer = csv_async::AsyncWriter::from_writer(vec![]); - writer.write_record(headers).await?; - for row in rows { - writer.write_record(row).await?; + let pdf_info = converter.convert_to_images(data)?; + bar.println(format!( + "{width} ↳ Converting {pdf_info_pages} images", + pdf_info_pages = pdf_info.pages.len() + )); + let mut redacted_pages = Vec::with_capacity(pdf_info.pages.len()); + for page in pdf_info.pages { + let mut png_image_bytes = std::io::Cursor::new(Vec::new()); + page.page_as_images + .write_to(&mut png_image_bytes, ImageFormat::Png)?; + let image_to_redact = RedacterDataItem { + content: RedacterDataItemContent::Image { + mime_type: mime::IMAGE_PNG, + data: png_image_bytes.into_inner().into(), + }, + file_ref: file_ref.clone(), + }; + let redacted_image = redacter.redact(image_to_redact).await?; + if let RedacterDataItemContent::Image { data, .. } = redacted_image.content { + redacted_pages.push(PdfPageInfo { + page_as_images: image::load_from_memory_with_format( + &data, + ImageFormat::Png, + )?, + ..page + }); + } } - writer.flush().await?; - let bytes = bytes::Bytes::from(writer.into_inner().await?); - Ok(Box::new(futures::stream::iter(vec![Ok(bytes)]))) + let redacted_pdf_info = PdfInfo { + pages: redacted_pages, + }; + let redact_pdf_as_images = converter.images_to_pdf(redacted_pdf_info)?; + Ok(RedacterDataItem { + content: RedacterDataItemContent::Pdf { + data: redact_pdf_as_images, + }, + file_ref: file_ref.clone(), + }) } + _ => Ok(redacted), } } diff --git a/src/redacters/ms_presidio.rs b/src/redacters/ms_presidio.rs index 72e69b3..f0f8cf2 100644 --- a/src/redacters/ms_presidio.rs +++ b/src/redacters/ms_presidio.rs @@ -175,9 +175,11 @@ impl<'a> Redacter for MsPresidioRedacter<'a> { match &input.content { RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, RedacterDataItemContent::Image { .. 
} => self.redact_image_file(input).await, - RedacterDataItemContent::Table { .. } => Err(AppError::SystemError { - message: "Attempt to redact of unsupported table type".to_string(), - }), + RedacterDataItemContent::Table { .. } | RedacterDataItemContent::Pdf { .. } => { + Err(AppError::SystemError { + message: "Attempt to redact of unsupported table type".to_string(), + }) + } } } @@ -204,6 +206,12 @@ impl<'a> Redacter for MsPresidioRedacter<'a> { { RedactSupportedOptions::Supported } + Some(media_type) + if Redacters::is_mime_pdf(media_type) + && self.ms_presidio_options.image_redact_url.is_some() => + { + RedactSupportedOptions::SupportedAsImages + } _ => RedactSupportedOptions::Unsupported, }) } diff --git a/src/redacters/open_ai_llm.rs b/src/redacters/open_ai_llm.rs index 7af4b75..f1159b9 100644 --- a/src/redacters/open_ai_llm.rs +++ b/src/redacters/open_ai_llm.rs @@ -146,11 +146,11 @@ impl<'a> Redacter for OpenAiLlmRedacter<'a> { async fn redact(&self, input: RedacterDataItem) -> AppResult { match &input.content { RedacterDataItemContent::Value(_) => self.redact_text_file(input).await, - RedacterDataItemContent::Image { .. } | RedacterDataItemContent::Table { .. } => { - Err(AppError::SystemError { - message: "Attempt to redact of unsupported table type".to_string(), - }) - } + RedacterDataItemContent::Image { .. } + | RedacterDataItemContent::Table { .. } + | RedacterDataItemContent::Pdf { .. } => Err(AppError::SystemError { + message: "Attempt to redact of unsupported table type".to_string(), + }), } }