diff --git a/Cargo.lock b/Cargo.lock index a17e9105..b48b6d20 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "adblock" -version = "0.8.8" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e6cf097ea9bb36bd04e1af44da71aaa336d2a3fb0515504f37ac297486ff5b" +checksum = "c38c2dd54639e86e42f6401271686385aef8380dba0f13096271955bb3b472c8" dependencies = [ "addr", "base64 0.13.1", @@ -28,9 +28,9 @@ dependencies = [ [[package]] name = "addr" -version = "0.14.0" +version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54ccac949a2afafdfc889e15c753bbc6ee8783e026bbe3d057b00b13907db70" +checksum = "a93b8a41dbe230ad5087cc721f8d41611de654542180586b315d9f4cf6b72bef" dependencies = [ "psl", "psl-types", @@ -38,18 +38,18 @@ dependencies = [ [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" dependencies = [ "gimli", ] [[package]] -name = "adler" -version = "1.0.2" +name = "adler2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "ahash" @@ -75,9 +75,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.16" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" [[package]] name = "ansi_term" @@ -90,47 +90,48 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", "windows-sys 0.52.0", @@ -138,9 +139,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = 
"atomic-traits" @@ -165,15 +166,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "aws-config" -version = "1.1.10" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48730d0b4c3d91c43d0d37168831d9fd0e065ad4a889a2ee9faf8d34c3d2804d" +checksum = "848d7b9b605720989929279fa644ce8f244d0ce3146fcca5b70e4eb7b3c020fc" dependencies = [ "aws-credential-types", "aws-runtime", @@ -191,7 +192,6 @@ dependencies = [ "fastrand", "hex", "http 0.2.12", - "hyper", "ring", "time", "tokio", @@ -202,9 +202,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -214,15 +214,16 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.9" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4ee6903f9d0197510eb6b44c4d86b493011d08b4992938f7b9be0333b6685aa" +checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", "aws-smithy-http", + "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", @@ -230,6 +231,7 @@ dependencies = [ "fastrand", "http 0.2.12", "http-body 0.4.6", + "once_cell", "percent-encoding", "pin-project-lite", "tracing", @@ -238,9 +240,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.22.0" +version = "1.51.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644c5939c1b78097d37f3341708978d68490070d4b0f8fa91f0878678c06a7ef" +checksum = "c09fd4b5c7ed75f52b913b4f3ff0501dae7f8cb9125f6d45db4553980cbc0528" dependencies = [ "ahash", "aws-credential-types", @@ -273,9 +275,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.19.0" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2be5ba83b077b67a6f7a1927eb6b212bf556e33bd74b5eaa5aa6e421910803a" +checksum = "70a9d27ed1c12b1140c47daf1bc541606c43fdafd918c4797d520db0043ceef2" dependencies = [ "aws-credential-types", "aws-runtime", @@ -295,9 +297,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.19.0" +version = "1.44.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "022ca669825f841aef17b12d4354ef2b8651e4664be49f2d9ea13e4062a80c9f" +checksum = "44514a6ca967686cde1e2a1b81df6ef1883d0e3e570da8d8bc5c491dcb6fc29b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -317,9 +319,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.19.0" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e4a5f5cb007347c1ab34a6d56456301dfada921fc9e57d687ecb08baddd11ff" +checksum = "cd7a4d279762a35b9df97209f6808b95d4fe78547fe2316b4d200a0283960c5a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -340,9 +342,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.0" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" +checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -380,9 +382,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.7" +version = "0.60.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" +checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -401,9 +403,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.4" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -412,9 +414,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.7" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -452,9 +454,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.2.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53572b4cd934ee5e8461ad53caa36e9d246aaef42166e3ac539e206a925d330" +checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -465,7 +467,8 @@ dependencies = [ "h2", "http 0.2.12", "http-body 0.4.6", - "http-body 1.0.0", + "http-body 1.0.1", + "httparse", "hyper", "hyper-rustls", "once_cell", @@ -478,9 +481,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.3.0" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccb2b3a7030dc9a3c9a08ce0b25decea5130e9db19619d4dffbbff34f75fe850" +checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" dependencies = [ 
"aws-smithy-async", "aws-smithy-types", @@ -495,9 +498,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.8" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" +checksum = "03701449087215b5369c7ea17fef0dd5d24cb93439ec5af0c7615f58c3f22605" dependencies = [ "base64-simd", "bytes", @@ -506,7 +509,7 @@ dependencies = [ "http 0.2.12", "http 1.1.0", "http-body 0.4.6", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "itoa", "num-integer", @@ -521,41 +524,40 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.7" +version = "0.60.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" +checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.9" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb278e322f16f59630a83b6b2dc992a0b48aa74ed47b4130f193fae0053d713" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http 0.2.12", - "rustc_version 0.4.0", + "rustc_version 0.4.1", "tracing", ] [[package]] name = "backtrace" -version = "0.3.71" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", "miniz_oxide", "object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -576,6 +578,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -592,15 +600,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -609,9 +608,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -630,9 +629,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" [[package]] name = "bytes-utils" @@ -646,12 +645,13 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.92" +version = "1.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" +checksum = "07b1695e2c7e8fc85310cde85aeaab7e3097f593c91d209d3f9df76c928100f0" dependencies = [ "jobserver", "libc", + "shlex", ] 
[[package]] @@ -660,6 +660,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "num-traits", +] + [[package]] name = "chumsky" version = "0.9.3" @@ -686,9 +695,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.4" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3" dependencies = [ "clap_builder", "clap_derive", @@ -696,9 +705,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b" dependencies = [ "anstream", "anstyle", @@ -708,36 +717,36 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.4" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] name = "clap_lex" -version = "0.7.0" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "cmake" -version = "0.1.50" +version = 
"0.1.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" dependencies = [ "cc", ] [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" [[package]] name = "colored" @@ -780,33 +789,33 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" dependencies = [ "libc", ] [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ - "rustc_version 0.4.0", + "rustc_version 0.4.1", ] [[package]] name = "crc32fast" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] @@ -832,9 +841,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" 
-version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "crypto-bigint" @@ -996,11 +1005,12 @@ dependencies = [ "aws-config", "aws-sdk-s3", "byteorder", - "clap 4.5.4", + "clap 4.5.18", "console", "env_logger", "flate2", "glob", + "human_bytes", "humantime", "indicatif", "jaq-core", @@ -1021,6 +1031,7 @@ dependencies = [ "serde_json", "simple_logger", "structopt", + "sysinfo", "tempfile", "thousands", "threadpool", @@ -1052,9 +1063,9 @@ dependencies = [ [[package]] name = "either" -version = "1.10.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "elliptic-curve" @@ -1103,9 +1114,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -1122,9 +1133,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "ff" @@ -1138,9 +1149,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.28" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "libz-ng-sys", @@ -1228,9 +1239,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -1239,9 +1250,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" [[package]] name = "glob" @@ -1281,9 +1292,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -1319,6 +1330,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -1344,9 +1361,9 @@ dependencies = [ [[package]] name = "hifijson" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18ae468bcb4dfecf0e4949ee28abbc99076b6a0077f51ddbc94dbfff8e6a870c" +checksum = 
"9958ab3ce3170c061a27679916bd9b969eceeb5e8b120438e6751d0987655c42" [[package]] name = "hmac" @@ -1392,9 +1409,9 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http 1.1.0", @@ -1402,22 +1419,22 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - "futures-core", + "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.8.0" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] name = "httpdate" @@ -1425,6 +1442,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "human_bytes" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91f255a4535024abf7640cb288260811fc14794f62b063652ed349f9a6c2348e" + [[package]] name = "humantime" version = "2.1.0" @@ -1433,9 +1456,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.28" +version = "0.14.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" dependencies = [ "bytes", "futures-channel", @@ -1500,9 +1523,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -1529,24 +1552,30 @@ checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" [[package]] name = "instant" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", ] [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi 0.3.9", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -1582,26 +1611,26 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jaq-core" -version = "1.2.1" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d6a5713b8f33675abfac79d1db0022a3f28764b2a6b96a185c199ad8dab86d" +checksum = 
"d6fda09ee08c84c81293fdf811d9ebaa87b327557b5391f290c926d728c2ddd4" dependencies = [ "aho-corasick", - "base64 0.21.7", + "base64 0.22.1", + "chrono", "hifijson", "jaq-interpret", "libm", "log", "regex", - "time", "urlencoding", ] [[package]] name = "jaq-interpret" -version = "1.2.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f569e38e5fc677db8dfda89ee0b4c25b3f53e811b16434fd14bdc5b43fc362ac" +checksum = "2fe95ec3c24af3fd9f3dd1091593f5e49b003a66c496a8aa39d764d0a06ae17b" dependencies = [ "ahash", "dyn-clone", @@ -1614,9 +1643,9 @@ dependencies = [ [[package]] name = "jaq-parse" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef6f8beb9f9922546419e774e24199e8a968f54c63a5a2323c8f3ef3321ace14" +checksum = "0346d7d3146cdda8acd929581f3d6626a332356c74d5c95aeaffaac2eb6dee82" dependencies = [ "chumsky", "jaq-syn", @@ -1624,29 +1653,27 @@ dependencies = [ [[package]] name = "jaq-std" -version = "1.2.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d7871c59297cbfdd18f6f1bbbafaad24e97fd555ee1e2a1be7a40a5a20f551a" +checksum = "bfbaa55578fd3b70433b594a370741e0c364e4afff92cc0099623fce87311bc1" dependencies = [ - "bincode", - "jaq-parse", "jaq-syn", ] [[package]] name = "jaq-syn" -version = "1.1.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d60101fb791b20c982731d848ed6e7d25363656497647c2093b68bd88398d6" +checksum = "1ba44fe4428c71304604261ecbae047ee9cfb60c4f1a6bd222ebbb31726d3948" dependencies = [ "serde", ] [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -1666,15 +1693,15 @@ dependencies 
= [ [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libm" @@ -1688,15 +1715,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "libc", ] [[package]] name = "libz-ng-sys" -version = "1.1.15" +version = "1.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" +checksum = "4436751a01da56f1277f323c80d584ffad94a3d14aecd959dd0dff75aa73a438" dependencies = [ "cmake", "libc", @@ -1710,15 +1737,15 @@ checksum = "89be94dbd775db37b46ca4f4bf5cf89adfb13ba197bfbcb69b2122848ee73c26" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -1726,15 +1753,15 @@ dependencies = [ [[package]] name = "log" 
-version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lru" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" dependencies = [ "hashbrown", ] @@ -1773,9 +1800,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" @@ -1794,29 +1821,30 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ - "adler", + "adler2", ] [[package]] name = "mio" -version = "0.8.11" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ + "hermit-abi 0.3.9", "libc", "wasi", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "monostate" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "878c2a1f1c70e5724fa28f101ca787b6a7e8ad5c5e4ae4ca3b0fa4a419fa9075" +checksum = 
"0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" dependencies = [ "monostate-impl", "serde", @@ -1824,22 +1852,21 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" +checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -1861,6 +1888,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -1878,9 +1914,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -1903,9 +1939,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.32.2" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", 
] @@ -1944,7 +1980,7 @@ version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -1961,7 +1997,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] @@ -1972,9 +2008,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "300.2.3+3.2.1" +version = "300.3.2+3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cff92b6f71555b61bb9315f7c64da3ca43d87531622120fea0195fc761b4843" +checksum = "a211a18d945ef7e648cc6e0058f4c548ee46aab922ea203e0d30e966ea23647b" dependencies = [ "cc", ] @@ -2017,9 +2053,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -2027,28 +2063,28 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] name = "parse-size" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "944553dd59c802559559161f9816429058b869003836120e262e8caec061b7ae" +checksum = 
"487f2ccd1e17ce8c1bfab3a65c89525af41cfad4c8659021a1e9a2aacd73b89b" [[package]] name = "paste" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "percent-encoding" @@ -2058,9 +2094,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" -version = "2.7.9" +version = "2.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "311fb059dee1a7b802f036316d790138c613a4e8b180c822e3925a662e9f0c95" +checksum = "fdbef9d1d47087a895abd220ed25eb4ad973a5e26f6a4367b038c25e28dfc2d9" dependencies = [ "memchr", "thiserror", @@ -2069,9 +2105,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.7.9" +version = "2.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73541b156d32197eecda1a4014d7f868fd2bcb3c550d5386087cfba442bf69c" +checksum = "4d3a6e3394ec80feb3b6393c725571754c6188490265c61aaf260810d6b95aa0" dependencies = [ "pest", "pest_generator", @@ -2079,22 +2115,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.7.9" +version = "2.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35eeed0a3fab112f75165fdc026b3913f4183133f19b49be773ac9ea966e8bd" +checksum = "94429506bde1ca69d1b5601962c73f4172ab4726571a59ea95931218cb0e930e" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] name = "pest_meta" -version = "2.7.9" +version = "2.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2adbf29bb9776f28caece835398781ab24435585fe0d4dc1374a61db5accedca" +checksum = "ac8a071862e93690b6e34e9a5fb8e33ff3734473ac0245b27232222c4906a33f" dependencies = [ "once_cell", "pest", @@ -2131,9 +2167,9 @@ checksum = 
"d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "portable-atomic" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +checksum = "d30538d42559de6b034bc76fd6dd4c38961b1ee5c6c56e3808c50128fdbc22ce" [[package]] name = "powerfmt" @@ -2143,9 +2179,12 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro-error" @@ -2173,18 +2212,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "psl" -version = "2.1.30" +version = "2.1.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3bb1091ea5d9ac71671164e4a9a0317144816c6c4f76ebaa0a6f43f32586463" +checksum = "ce9398ad066421139b2e3afe16ea46772ffda30bd9ba57554dc035df5e26edc8" dependencies = [ "psl-types", ] @@ -2257,9 +2296,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -2327,18 +2366,18 @@ dependencies = [ [[package]] name = "redox_syscall" 
-version = "0.4.1" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", ] [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", @@ -2347,9 +2386,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.4" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -2359,9 +2398,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", @@ -2370,15 +2409,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = 
"7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "rfc6979" @@ -2408,9 +2447,9 @@ dependencies = [ [[package]] name = "rmp" -version = "0.8.12" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9860a6cc38ed1da53456442089b4dfa35e7cedaa326df63017af88385e6b20" +checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" dependencies = [ "byteorder", "num-traits", @@ -2430,9 +2469,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc_version" @@ -2445,20 +2484,20 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "semver 1.0.22", + "semver 1.0.23", ] [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -2479,14 +2518,15 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.3" +version = "0.23.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" +checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" dependencies = [ "log", + "once_cell", "ring", 
"rustls-pki-types", - "rustls-webpki 0.102.2", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] @@ -2514,9 +2554,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.4.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" @@ -2530,9 +2570,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.102.2" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "ring", "rustls-pki-types", @@ -2541,17 +2581,17 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "schannel" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2592,11 +2632,11 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.10.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", 
"libc", @@ -2605,9 +2645,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.10.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" +checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" dependencies = [ "core-foundation-sys", "libc", @@ -2624,9 +2664,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "semver-parser" @@ -2639,31 +2679,32 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.197" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -2690,11 +2731,17 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -2738,9 +2785,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -2818,9 +2865,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -2835,31 +2882,47 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.58" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "sysinfo" +version = "0.30.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", +] + [[package]] name = "target-lexicon" 
-version = "0.12.14" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2882,22 +2945,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] @@ -2922,7 +2985,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ "deranged", - "itoa", "num-conv", "powerfmt", "serde", @@ -2948,9 +3010,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = 
"445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -2968,7 +3030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" dependencies = [ "aho-corasick", - "clap 4.5.4", + "clap 4.5.18", "derive_builder", "esaxx-rs", "getrandom", @@ -2997,32 +3059,31 @@ dependencies = [ [[package]] name = "tokio" -version = "1.37.0" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] @@ -3037,23 +3098,22 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", ] [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = 
"8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -3074,7 +3134,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] @@ -3112,15 +3172,15 @@ checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -3136,15 +3196,15 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.11" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode_categories" @@ -3166,18 +3226,17 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.6" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "flate2", "log", "native-tls", "once_cell", - "rustls 0.22.3", + "rustls 0.23.13", "rustls-pki-types", - "rustls-webpki 0.102.2", "serde", "serde_json", "url", @@ -3186,9 +3245,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna 0.5.0", @@ -3203,15 +3262,15 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" [[package]] name = "vcpkg" @@ -3227,9 +3286,9 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "vsimd" @@ -3254,9 +3313,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = 
"webpki-roots" -version = "0.26.1" +version = "0.26.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" dependencies = [ "rustls-pki-types", ] @@ -3279,11 +3338,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "winapi", + "windows-sys 0.59.0", ] [[package]] @@ -3292,6 +3351,25 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -3307,7 +3385,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 
+dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -3327,17 +3414,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -3348,9 +3436,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -3360,9 +3448,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -3372,9 +3460,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -3384,9 +3478,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -3396,9 +3490,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -3408,9 +3502,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -3420,9 +3514,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "xmlparser" @@ -3432,53 +3526,54 @@ checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.77", ] [[package]] name = "zeroize" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" [[package]] name = "zstd" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.1.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", 
] [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 44963812..02105be1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ flate2 = { version = "1.0.28", features = [ "zlib-ng", ], default-features = false } glob = "0.3.1" +human_bytes = "0.4.3" humantime = "2.1" indicatif = "0.17" jsonpath-rust = "0.3.0" @@ -42,6 +43,7 @@ simple_logger = { version = "3.0", features = [ "colors", ], default-features = false, optional = true } structopt = { version = "0.3", optional = true } +sysinfo="0.30.7" thousands = "0.2" threadpool = "1.8.1" tokenizers = { version = "0.15.0", features = ["http"] } @@ -56,6 +58,7 @@ jaq-std = "1.2.1" jaq-parse = "1.0.2" jaq-interpret = { version = "1.2.1", features = ["serde_json"] } zstd = "0.13.1" +once_cell = "1.20.2" [dev-dependencies] tempfile = "3.10.1" diff --git a/docs/examples/dedupe-by-url.json b/docs/examples/dedupe-by-url.json index 0664f796..a1edc49e 100644 --- a/docs/examples/dedupe-by-url.json +++ b/docs/examples/dedupe-by-url.json @@ -7,6 +7,7 @@ "output": "tests/work/url/output" }, "dedupe": { + "dedupe_method": "documents", "name": "dedupe_by_url", "documents": { "attribute_name": "bff_duplicate_url", diff --git a/docs/examples/dedupe-paragraphs.json b/docs/examples/dedupe-paragraphs.json index 93a8e257..80e66612 100644 --- a/docs/examples/dedupe-paragraphs.json +++ b/docs/examples/dedupe-paragraphs.json @@ -7,6 +7,7 @@ "output": "tests/work/para/output" }, "dedupe": { + "dedupe_method": "paragraphs", "name": "dedupe_paragraphs", "paragraphs": { "attribute_name": "bff_duplicate_paragraph_spans" diff --git a/docs/getting-started.md b/docs/getting-started.md index 9d90ed9b..0f2fdc21 
100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -92,6 +92,7 @@ After tagging, we deduplicate the dataset at a paragraph level. ```shell dolma dedupe \ --documents "wikipedia/v0/documents/*" \ + --dedupe.dedupe_method "paragraphs" \ --dedupe.paragraphs.attribute_name 'bff_duplicate_paragraph_spans' \ --dedupe.skip_empty \ --bloom_filter.file /tmp/deduper_bloom_filter.bin \ diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index c4a60a66..06621a56 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -1,5 +1,6 @@ from contextlib import ExitStack from dataclasses import dataclass +from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional @@ -46,6 +47,18 @@ class ParagraphDedupeConfig: ) +@dataclass +class DCLMDedupeConfig: + attribute_name: Optional[str] = field(help="Name of the output field in the tagger") + by_ngram: Optional[NgramDedupeConfig] = field( + default=None, help="Configuration for fuzzy dedupe", default_factory=NgramDedupeConfig + ) + paragraph_separator: Optional[str] = field( + default="\n", + help="String to use to separate paragraphs. By default, paragraphs are separated by newlines.", + ) + + @dataclass class DocumentDedupeConfig: attribute_name: Optional[str] = field(help="Name of the output field in the tagger") @@ -77,6 +90,15 @@ class BloomFilterConfig: "estimated_doc_count." 
), ) + save_to_disk: bool = field( + default=True, help=("If False, ignore the 'file' field and do NOT save the populated bloom filter to disk") + ) + sysram_limit: float = field( + default=0.9, + help=( + "Maximum fraction of the system RAM we use -- will print out a warning if we really want more than this" + ), + ) @dataclass @@ -88,6 +110,7 @@ class DedupeConfig: paragraphs: Optional[ParagraphDedupeConfig] = field( default=None, help="Configuration for paragraph deduplication" ) + dclm: Optional[DCLMDedupeConfig] = field(default=None, help="Configuration for DCLM deduplication") skip_empty: Optional[bool] = field(default=False, help="If true, empty documents/paragraphs will be skipped") min_length: Optional[int] = field(default=0, help="Minimum length of documents/paragraphs to be deduplicated") min_words: Optional[int] = field( @@ -99,6 +122,10 @@ class DedupeConfig: partition_index: Optional[int] = field( default=0, help="The index of the partition being processed, in the range [0, num_partitions)." ) + dedupe_method: Optional[str] = field( + default=None, + help="Selects which dedupe method to use. Must be either empty or in the set {paragraphs, documents, dclm}", + ) @dataclass @@ -108,7 +135,7 @@ class DeduperConfig: dedupe: DedupeConfig = field(help="Deduplication configuration. Required.") bloom_filter: BloomFilterConfig = field(help="Bloom filter configuration. Required.") processes: int = field( - default=1, help="Number of processes to use for deduplication. If 1, no multiprocessing will be used." + default=0, help="Number of processes to use for deduplication. If 1, no multiprocessing will be used." 
) compression: CompressionConfig = field( default=CompressionConfig(), @@ -133,9 +160,8 @@ class DeduperCli(BaseCli): @classmethod def run(cls, parsed_config: DeduperConfig): logger = get_logger("tagger") - + dict_config: Dict[str, Any] = {} - with ExitStack() as stack: work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir)) @@ -155,22 +181,29 @@ def run(cls, parsed_config: DeduperConfig): if dedupe_dict_config["min_words"] < 0: raise ValueError("min_words must be >= 0") - # add either the document or paragraph dedupe config - if not ( - om.is_missing(parsed_config.dedupe.documents, "attribute_name") - and om.is_missing(parsed_config.dedupe.documents, "key") - ): - cfg = om.to_container(parsed_config.dedupe.documents) - assert isinstance(cfg, dict), "Expected dedupe.documents to be a dict" - dedupe_dict_config["documents"] = cfg - try_name = try_name or cfg["attribute_name"] - elif not om.is_missing(parsed_config.dedupe.paragraphs, "attribute_name"): - cfg = om.to_container(parsed_config.dedupe.paragraphs) - assert isinstance(cfg, dict), "Expected dedupe.paragraphs to be a dict" - dedupe_dict_config["paragraphs"] = cfg - try_name = try_name or cfg["attribute_name"] - else: - raise ValueError("Either dedupe.documents or dedupe.paragraphs must be specified") + # select the dedupe method (explicit `dedupe_method`, otherwise inferred) and add its config + dedupe_method = parsed_config.dedupe.dedupe_method # explicit choice; None when not given + if dedupe_method is None: + # not given explicitly: infer it from whichever sub-config has its attribute_name (or key) set + if not ( + om.is_missing(parsed_config.dedupe.documents, "attribute_name") + and om.is_missing(parsed_config.dedupe.documents, "key") + ): + dedupe_method = "documents" + elif not (om.is_missing(parsed_config.dedupe.paragraphs, "attribute_name")): + dedupe_method = "paragraphs" + elif not (om.is_missing(parsed_config.dedupe.dclm, "attribute_name")): + dedupe_method = "dclm" + else: + raise ValueError("Some dedupe method must be specified (either explicitly or implicitly)") +
dedupe_dict_config["dedupe_method"] = dedupe_method + dedupe_dict_config[dedupe_method] = om.to_container(parsed_config.dedupe[dedupe_method]) + assert ( + dedupe_dict_config[dedupe_method].get("attribute_name") is not None + ), "Need attribute name for deduplication" + cfg = om.to_container(parsed_config.dedupe[dedupe_method]) + assert isinstance(cfg, dict), "Expected dedupe.%s to be a dict" % dedupe_method + try_name = try_name or cfg["attribute_name"] if try_name is None: raise ValueError("dedupe.name must be specified") @@ -215,6 +248,8 @@ def run(cls, parsed_config: DeduperConfig): "size_in_bytes": int(parsed_config.bloom_filter.size_in_bytes), "estimated_doc_count": int(parsed_config.bloom_filter.estimated_doc_count), "desired_false_positive_rate": float(parsed_config.bloom_filter.desired_false_positive_rate), + "save_to_disk": parsed_config.bloom_filter.save_to_disk, + "sysram_limit": float(parsed_config.bloom_filter.sysram_limit), } if dict_config["bloom_filter"]["size_in_bytes"] <= 0 and ( @@ -247,7 +282,11 @@ def run(cls, parsed_config: DeduperConfig): deduper(dict_config) # upload to remote file if necessary - if not parsed_config.bloom_filter.read_only and not path_is_local: + if ( + not parsed_config.bloom_filter.read_only + and not path_is_local + and parsed_config.bloom_filter.save_to_disk + ): print(f"Pushing Bloom filter to {parsed_config.bloom_filter.file}") local = stack.enter_context(smart_open.open(local_bloom_file, "rb")) remote = stack.enter_context(smart_open.open(parsed_config.bloom_filter.file, "wb")) diff --git a/src/bloom_filter.rs b/src/bloom_filter.rs index f38edb6d..3b1277e7 100644 --- a/src/bloom_filter.rs +++ b/src/bloom_filter.rs @@ -1,6 +1,8 @@ use ahash::RandomState; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; +use human_bytes::human_bytes; use rand::Rng; +use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; use std::fs::{create_dir_all, OpenOptions}; @@ -10,6 +12,8 @@ use std::io::{BufReader, BufWriter, Write}; use
std::mem::size_of; use std::path::PathBuf; use std::sync::atomic::{AtomicU32, Ordering}; +use sysinfo::System; + mod bloom_test; // A thread-safe bloom filter. pub struct BloomFilter { @@ -59,6 +63,61 @@ impl BloomFilter { size_in_bytes } + pub fn compute_bloom_size_binsearch( + expected_elements: usize, + fp_rate: f64, + sysram_limit: Option<f64>, + num_hashers: usize, + ) -> usize { + /* Uses binary search to find the smallest bloom filter size (in bytes) whose false-positive rate is <= fp_rate. + If sysram_limit is Some(f) with f > 0: never returns more than fraction f of the total system RAM + If num_hashers == 0: computes the optimal number of hashers on the fly + */ + + // Cap the search at `sysram_limit` of system RAM (no cap when unset or 0) and set binsearch bounds + let mut sys = System::new_all(); + sys.refresh_all(); + let sysram_limit: f64 = sysram_limit.unwrap_or(0.0); + let mut lo = 1 as usize; + let mut hi = if sysram_limit > 0.0 { + ((sys.total_memory() as f64) * sysram_limit) as usize + } else { + std::usize::MAX / 8 + }; + + let compute_hashers = num_hashers == 0; + let num_hashers = if num_hashers == 0 { + BloomFilter::optimal_number_of_hashers(hi, expected_elements) + } else { + num_hashers + }; + + if (sysram_limit > 0.0) + && BloomFilter::prob_of_false_positive(hi, expected_elements, num_hashers) > fp_rate + { + log::warn!("To achieve the desired false-positive rate you'd need more than the allowed fraction of system RAM.
Defaulting to {:?} SysRAM", sysram_limit); + return hi; + } + + // Do BinSearch for the smallest size meeting fp_rate (assumes the fp rate shrinks as the size grows) + while lo < hi { + let mid = lo + (hi - lo) / 2; + let num_hashers = if compute_hashers { + BloomFilter::optimal_number_of_hashers(mid, expected_elements) + } else { + num_hashers + }; + let computed_fp = + BloomFilter::prob_of_false_positive(mid, expected_elements, num_hashers); + if computed_fp > fp_rate { + lo = mid + 1; + } else { + hi = mid; // mid meets the target: keep it as a candidate instead of discarding it + } + } + hi + } + #[allow(dead_code)] pub fn my_prob_of_false_positive(&self, expected_elements: usize) -> f64 { Self::prob_of_false_positive( @@ -68,6 +127,19 @@ impl BloomFilter { ) } + pub fn calculate_sparsity(&self) -> f64 { + let set_bits: usize = self + .bits + .par_iter() + .map(|atomic| { + let value = atomic.load(std::sync::atomic::Ordering::Relaxed); + value.count_ones() as usize + }) + .sum(); + let total_bits = self.size_in_bytes() * 8; + (set_bits as f64) / (total_bits as f64) + } + #[allow(dead_code)] pub fn size_in_bytes(&self) -> usize { self.bits.len() * size_of::() @@ -86,8 +158,9 @@ impl BloomFilter { } let number_of_u32 = size_in_bytes / size_of::(); - let bits: Vec = std::iter::repeat_with(|| AtomicU32::new(0)) - .take(number_of_u32) + let bits = (0..number_of_u32) + .into_par_iter() + .map(|_| AtomicU32::default()) .collect(); Self { bits, @@ -243,11 +316,13 @@ impl BloomFilter { log::info!("Creating new bloom filter..."); let mut bloom_filter_size: usize = config.size_in_bytes; if bloom_filter_size == 0 { - bloom_filter_size = BloomFilter::suggest_size_in_bytes( + bloom_filter_size = BloomFilter::compute_bloom_size_binsearch( config.estimated_doc_count, config.desired_false_positive_rate, + config.sysram_limit, + 0, ); - log::info!("Creating bloom filter with size {} bytes to achieve false positive rate {} for {} elements", bloom_filter_size, config.desired_false_positive_rate, config.estimated_doc_count); + log::info!("Creating bloom filter with size {} bytes to achieve false positive rate {} for {} elements",
human_bytes(bloom_filter_size as f64), config.desired_false_positive_rate, config.estimated_doc_count); } let num_hashers = BloomFilter::optimal_number_of_hashers( bloom_filter_size, @@ -260,7 +335,7 @@ impl BloomFilter { ); log::info!( "Bloom filter will have size {}, {} hashers, false positive rate {}.", - bloom_filter_size, + human_bytes(bloom_filter_size as f64), num_hashers, p ); @@ -271,11 +346,13 @@ impl BloomFilter { } } -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Debug)] pub struct BloomFilterConfig { pub file: String, pub size_in_bytes: usize, pub read_only: bool, pub estimated_doc_count: usize, pub desired_false_positive_rate: f64, + pub save_to_disk: bool, + pub sysram_limit: Option, } diff --git a/src/deduper.rs b/src/deduper.rs index f2ad99b1..715f9566 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -1,81 +1,144 @@ +use human_bytes::human_bytes; use std::collections::VecDeque; use std::io; -use std::io::{BufRead, Write}; +use std::io::{BufRead, Error, ErrorKind, Write}; use std::path::PathBuf; -use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::Arc; +use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; use serde_json::{json, Value}; -use threadpool::ThreadPool; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use once_cell::sync::OnceCell; use crate::bloom_filter::BloomFilter; use crate::io::MultiStream; +use crate::log_pbar::LogProgressBar; use crate::s3_util; use crate::shard::shard_config::{CompressionConfig, WorkDirConfig}; use crate::shard::{find_objects_matching_patterns, FileCache}; use crate::wimbd::tokens::tokenize; + + use deduper_config::*; +static GLOBAL_POOL: OnceCell<()> = OnceCell::new(); + + +fn build_pbar(num_items: usize) -> LogProgressBar { + let mut pbar = LogProgressBar::new(num_items); + pbar.inc(0); + pbar +} + pub fn run(config: DeduperConfig) -> Result { + // Set global thread count for rayon parallelism + 
let start_main = Instant::now(); + if config.processes > 0 { + GLOBAL_POOL.get_or_init(|| { + ThreadPoolBuilder::new() + .num_threads(config.processes) + .build_global() + .expect("Failed to build global thread pool") + }); + } + let bloom_filter = BloomFilter::initialize(&config.bloom_filter).unwrap(); let bloom_filter = Arc::new(bloom_filter); - let paths = find_objects_matching_patterns(&config.documents) .unwrap() .clone(); - if !(config.dedupe.paragraphs.is_some() ^ config.dedupe.documents.is_some()) { - log::error!("Must dedupe either paragraphs or documents"); - return Err(paths.len() as u32); - } - - let threadpool = ThreadPool::new(config.processes); + let docs_processed = AtomicUsize::new(0); + let seen_bytes = AtomicUsize::new(0); + let removed_bytes = AtomicUsize::new(0); let failed_shard_count = AtomicU32::new(0); let failed_shard_count_ref = Arc::new(failed_shard_count); - for p in paths { + let pbar = Arc::new(Mutex::new(build_pbar(paths.len()))); + + println!("Starting par iter thing"); + paths.par_iter().for_each(|p| { let path = p.clone(); let work_dirs = config.work_dir.clone(); let dedupe = config.dedupe.clone(); - let bloom_filter = bloom_filter.clone(); - let failed_shard_count_ref = failed_shard_count_ref.clone(); let compression = match config.compression.clone() { Some(c) => c, None => CompressionConfig::infer(), }; - threadpool.execute(move || { - let result = write_attributes( - path, - work_dirs, - dedupe, - compression, - bloom_filter, - !config.is_s3_volume.unwrap_or(false), - ); - if let Err(e) = result { - log::error!("Failed to process {:?}: {}", p, e); - failed_shard_count_ref.fetch_add(1, Ordering::Relaxed); + let result = write_attributes( + path, + work_dirs, + dedupe, + compression, + bloom_filter.clone(), + !config.is_s3_volume.unwrap_or(false), + ); + if let Err(e) = result { + log::error!("Failed to process {:?}: {}", p, e); + failed_shard_count_ref.fetch_add(1, Ordering::Relaxed); + } else { + let (path_docs_processed, 
path_seen_bytes, path_removed_bytes) = result.unwrap(); + docs_processed.fetch_add(path_docs_processed, Ordering::Relaxed); + seen_bytes.fetch_add(path_seen_bytes, Ordering::Relaxed); + removed_bytes.fetch_add(path_removed_bytes, Ordering::Relaxed); + } + pbar.lock().unwrap().inc(1); + }); + + if config.bloom_filter.save_to_disk { + let bloom_filter_file = PathBuf::from(&config.bloom_filter.file); + log::info!("Writing bloom filter to {:?}...", config.bloom_filter.file); + match bloom_filter.write_to_file(&bloom_filter_file) { + Ok(_) => log::info!("Bloom filter written."), + Err(e) => { + log::error!("Write failed: {}", e); + panic!("Failed to write bloom filter"); } - }); - } - threadpool.join(); - - let bloom_filter_file = PathBuf::from(&config.bloom_filter.file); - log::info!("Writing bloom filter to {:?}...", config.bloom_filter.file); - match bloom_filter.write_to_file(&bloom_filter_file) { - Ok(_) => log::info!("Bloom filter written."), - Err(e) => { - log::error!("Write failed: {}", e); - panic!("Failed to write bloom filter"); } } + // Log outputs let failure_count = failed_shard_count_ref.load(Ordering::Relaxed); + let seen_bytes = seen_bytes.into_inner(); + let removed_bytes = removed_bytes.into_inner(); + log::info!("----------------------------------"); + log::info!( + "Finished processing files in {:?} (s)", + start_main.elapsed().as_secs() + ); + log::info!( + "Was successful on {:?}/{:?} of the paths", + paths.len() - failure_count as usize, + paths.len() + ); + if failure_count > 0 { + log::error!("FAILED ON {:?} PATHS", failure_count); + } + log::info!( + "Bloom filter has sparsity {:?}", + bloom_filter.calculate_sparsity() + ); + log::info!( + "Processed {:?} documents in total", + docs_processed.into_inner() + ); + log::info!( + "Processed {} of data, removed {} of them | Removal rate of {:?}", + human_bytes(seen_bytes as f32), + human_bytes(removed_bytes as f32), + if seen_bytes == 0 { + 0.0 + } else { + removed_bytes as f32 / seen_bytes as f32 
+ } + ); + + if failure_count == 0 { - log::info!("Done!"); Ok(failure_count) } else { - log::error!("{} shards failed to process.", failure_count); Err(failure_count) } } @@ -106,7 +169,7 @@ fn write_attributes( compression: CompressionConfig, bloom_filter: Arc, label_temp: bool, -) -> Result<(), io::Error> { +) -> Result<(usize, usize, usize), io::Error> { let cache = FileCache { s3_client: Box::new(s3_util::new_client(None)?), work: work_dirs.clone(), @@ -125,18 +188,19 @@ fn write_attributes( let attr_prefix = format!("/attributes/{}/", attr_key); docs_location.replace("/documents/", &attr_prefix) }; + if attrs_location == docs_location { + return Err(Error::new( + ErrorKind::InvalidInput, + "Malformed file location: no /documents/ in file path! Continuing would overwrite data!")); + } let local_output = cache.prepare_output(&attrs_location, label_temp)?; - let mut num_processed = 0; - let mut num_observed = 0; + let mut docs_processed = 0; + let mut seen_bytes = 0; + let mut removed_bytes = 0; if local_output.exists() { log::info!("Skipping {:?} because it already exists", attrs_location); - return Ok(()); + return Ok((docs_processed, seen_bytes, removed_bytes)); } - log::info!( - "Writing attributes for {} to {}", - docs_location, - local_output.display() - ); std::fs::create_dir_all(local_output.parent().unwrap())?; log::info!( @@ -182,9 +246,6 @@ fn write_attributes( ) .writer()?; - let min_content_length = dedupe_config.min_length.unwrap_or(0); - let min_word_count = dedupe_config.min_words.unwrap_or(0); - for (line_number, line) in reader.lines().enumerate() { let line = match line { Ok(line) => line, @@ -198,266 +259,23 @@ fn write_attributes( break; } }; + docs_processed += 1; let data: Value = serde_json::from_str(&line)?; - let mut attributes = json!({}); - - if let Some(ref cfg) = dedupe_config.documents { - let document_key = { - let mut finder = jsonpath_rust::JsonPathFinder::from_str("{}", &cfg.key) - .map_err(|e| 
io::Error::new(io::ErrorKind::Other, e)) - .unwrap(); - finder.set_json(Box::new(data.clone())); - finder - .find() - .as_array() - .unwrap() - .get(0) - .unwrap() - .as_str() - .unwrap() - .to_string() - }; - - let attr_name_with_index; - let attr_name = if dedupe_config.num_partitions.unwrap_or(1) > 1 { - attr_name_with_index = format!( - "{}_{}", - cfg.attribute_name, - dedupe_config.partition_index.unwrap_or(0) - ); - &attr_name_with_index + let id = data["id"].clone(); + let (attributes, doc_seen_bytes, doc_removed_bytes) = + if dedupe_config.dedupe_method == "documents" { + dedupe_documents(data, dedupe_config.clone(), &bloom_filter) + } else if dedupe_config.dedupe_method == "paragraphs" { + dedupe_paragraphs(data, dedupe_config.clone(), &bloom_filter) + } else if dedupe_config.dedupe_method == "dclm" { + dedupe_dclm(data, dedupe_config.clone(), &bloom_filter) } else { - &cfg.attribute_name + (json!({}), 0, 0) }; - - if min_word_count > 0 { - // Split the text into words and check the number of words. 
- let words = tokenize(&document_key); - if words.count() < min_word_count { - // skip documents with fewer than min_word_count words - attributes[attr_name] = Value::Array(Vec::new()); - } - } else if document_key.len() < min_content_length { - // skip length 0 documents - attributes[attr_name] = Value::Array(Vec::new()); - } else if dedupe_config.skip_empty.unwrap_or(false) - && document_key.trim().is_empty() - { - // skip empty documents if dedupe_config.skip_empty is true - // and the document key is empty after trimming (i.e., removing whitespace) - attributes[attr_name] = Value::Array(Vec::new()); - } else { - let dedupe_key = VecDeque::from([document_key.as_str()]); - - //Just compute the first hash to see if it matches the partition - num_observed += 1; - let hashes = build_hashes( - &bloom_filter, - &dedupe_key, - dedupe_config.num_partitions.unwrap_or(1), - dedupe_config.partition_index.unwrap_or(0), - ); - - if !hashes.is_empty() { - num_processed += 1; - //Compute the remaining hashes - if bloom_filter.contains(&hashes) { - // attributes[&cfg.attribute_name] = Value::Bool(true); - - let mut duplicate_docs_array = Vec::new(); - let attr = vec![ - Value::from(0), - Value::Number(document_key.len().into()), - Value::from(1), - ]; - duplicate_docs_array.push(Value::Array(attr)); - attributes[attr_name] = Value::Array(duplicate_docs_array); - } else if !bloom_filter.read_only { - bloom_filter.insert(&hashes); - } - } else { - //The dedupe key doesn't belong to this partition - attributes[attr_name] = Value::Array(Vec::new()); - } - } - } - match dedupe_config.paragraphs { - None => {} - Some(ref cfg) => { - // Split the text into paragraphs and check each one. 
- let text = data["text"].as_str().unwrap(); - let text_length = text.len(); - let mut offset = 0; - - if text_length > 0 { - let paragraphs = - text.split(cfg.paragraph_separator.as_deref().unwrap_or("\n")); - let mut duplicate_paragraph_spans = Vec::new(); - - // skip empty documents if text_length is 0 - for p in paragraphs { - let par_start = offset; - let par_char_length = p.chars().count(); - offset += par_char_length; - if offset < text_length - 1 { - offset += 1; // For the newline - } - let par_end = offset; - - if par_char_length < min_content_length { - // skip length 0 paragraphs - continue; - } - if min_word_count > 0 { - // Split the text into words and check the number of words. - let words = tokenize(&p); - - if words.count() < min_word_count { - // skip documents with fewer than min_words words - continue; - } - } else if dedupe_config.skip_empty.unwrap_or(false) - && p.trim().is_empty() - { - // skip empty paragraphs if dedupe_config.skip_empty is true - // and the paragraph is empty after trimming (i.e., removing whitespace) - continue; - } else { - if cfg.by_ngram.is_none() - || cfg.by_ngram.as_ref().unwrap().ngram_length == 0 - { - let dedupe_key = VecDeque::from([p]); - let hashes = build_hashes( - &bloom_filter, - &dedupe_key, - dedupe_config.num_partitions.unwrap_or(1), - dedupe_config.partition_index.unwrap_or(0), - ); - num_observed += 1; - if !hashes.is_empty() { - num_processed += 1; - // Dedupe the entire paragraph - if bloom_filter.contains(&hashes) { - let span = vec![ - Value::Number(par_start.into()), - Value::Number(par_end.into()), - Value::from(1), - ]; - // add span to duplicate_paragraph_spans - duplicate_paragraph_spans.push(Value::Array(span)); - } else if !bloom_filter.read_only { - bloom_filter.insert(&hashes); - } - } - } else { - // Dedupe by ngram overlap - let by_ngram = cfg.clone().by_ngram.unwrap(); - let ngram_length = by_ngram.ngram_length; - let stride = by_ngram.stride; - let mut ngram: VecDeque<&str> = - 
VecDeque::with_capacity(ngram_length); - let mut word_index = 0; - let mut last_ngram_start = 0; - let mut ngram_count = 0; - let mut duplicate_ngram_count = 0; - for token in tokenize(p) { - ngram.push_back(token); - if ngram.len() == ngram_length { - let ngram_start = word_index - (ngram_length - 1); - if last_ngram_start == 0 - || ngram_start - last_ngram_start >= stride - { - last_ngram_start = ngram_start; - ngram_count += 1; - let dedupe_key = VecDeque::from(ngram.clone()); - let hashes = build_hashes( - &bloom_filter, - &dedupe_key, - dedupe_config.num_partitions.unwrap_or(1), - dedupe_config.partition_index.unwrap_or(0), - ); - num_observed += 1; - if !hashes.is_empty() { - num_processed += 1; - if bloom_filter.contains(&hashes) { - duplicate_ngram_count += 1; - } else if !bloom_filter.read_only { - bloom_filter.insert(&hashes); - } - } - } - ngram.pop_front(); - } - word_index += 1; - } - if ngram_count < 2 - && !by_ngram.skip_short_paragraphs.unwrap_or(false) - { - // Too few ngrams to dedupe by overlap. Just compare the whole thing - let dedupe_key = VecDeque::from([p]); - let hashes = bloom_filter.hashes(&dedupe_key); - - let span_score = match bloom_filter.contains(&hashes) { - // we found a match! score is 1.0 - true => 1.0, - false => { - // this is a new paragraph, push to bloom filter - if !bloom_filter.read_only { - bloom_filter.insert(&hashes); - } - // score is 0.0 because it's not a duplicate - 0.0 - } - }; - - // we check if the score is above the threshold; note that - // users can set the threshold to 0.0 to always include the span, - // or 1.0 to only include spans that are exact duplicates. 
- if span_score >= by_ngram.overlap_threshold { - let span = vec![ - Value::Number(par_start.into()), - Value::Number(par_end.into()), - Value::from(span_score), - ]; - // add span to duplicate_paragraph_spans - duplicate_paragraph_spans.push(Value::Array(span)); - } - } else { - let overlap_fraction = - duplicate_ngram_count as f32 / ngram_count as f32; - - if overlap_fraction >= by_ngram.overlap_threshold { - let span = vec![ - Value::Number(par_start.into()), - Value::Number(par_end.into()), - Value::from(overlap_fraction), - ]; - // add span to duplicate_paragraph_spans - duplicate_paragraph_spans.push(Value::Array(span)); - } - } - } - } - } - - let attr_name_with_index; - let attr_name = if dedupe_config.num_partitions.unwrap_or(1) > 1 { - attr_name_with_index = format!( - "{}_{}", - cfg.attribute_name, - dedupe_config.partition_index.unwrap_or(0) - ); - &attr_name_with_index - } else { - &cfg.attribute_name - }; - attributes[attr_name] = Value::Array(duplicate_paragraph_spans); - } - } - } - + seen_bytes += doc_seen_bytes; + removed_bytes += doc_removed_bytes; let mut output_object = json!({}); - output_object["id"] = data["id"].clone(); + output_object["id"] = id; output_object["attributes"] = attributes; serde_json::to_writer(&mut writer_stream, &output_object)?; writer_stream.write_all(b"\n")?; @@ -479,17 +297,390 @@ fn write_attributes( } log::info!( - " Num processed: {} / Job total: {}", - num_processed, - num_observed + "{:?} | Saw {:?} docs and {:?} bytes, removed {:?} of them", + docs_location, + docs_processed, + seen_bytes, + removed_bytes ); if label_temp { //Finalize output performs a rename operation, which isn't implemented in mountpoint-s3 (https://github.com/awslabs/mountpoint-s3/issues/506) cache.finalize_output(&attrs_location)?; } - Ok(()) + Ok((docs_processed, seen_bytes, removed_bytes)) +} + +/*================================================================= += DEDUP SINGLE-DOCUMENT METHODS = 
+=================================================================*/ +/* Methods to process a single document. Roughly shared signatures here: +Inputs: + data: serde_json::Value + key: str key to dedup on (usually this should be "text") + config: the DedupeConfig struct +Output: + the json with the duplicate spans +*/ + +pub fn dedupe_documents( + data: Value, + dedupe_config: DedupeConfig, + bloom_filter: &Arc, +) -> (Value, usize, usize) { + let mut attributes = json!({}); + let cfg = dedupe_config.documents.unwrap(); + let min_word_count = dedupe_config.min_words.unwrap_or(0); + let min_content_length = dedupe_config.min_length.unwrap_or(0); + // Get the thing we're trying to dedup as 'document_key' + let document_key = { + let mut finder = jsonpath_rust::JsonPathFinder::from_str("{}", &cfg.key) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e)) + .unwrap(); + finder.set_json(Box::new(data.clone())); + finder + .find() + .as_array() + .unwrap() + .get(0) + .unwrap() + .as_str() + .unwrap() + .to_string() + }; + + let seen_bytes = document_key.len(); + let mut removed_bytes = 0; + let attr_name_with_index; + let attr_name = if dedupe_config.num_partitions.unwrap_or(1) > 1 { + attr_name_with_index = format!( + "{}_{}", + cfg.attribute_name, + dedupe_config.partition_index.unwrap_or(0) + ); + &attr_name_with_index + } else { + &cfg.attribute_name + }; + + if min_word_count > 0 { + // Split the text into words and check the number of words. 
+ let words = tokenize(&document_key); + if words.count() < min_word_count { + // skip documents with fewer than min_word_count words + attributes[attr_name] = Value::Array(Vec::new()); + } + } else if document_key.len() < min_content_length { + // skip length 0 documents + attributes[attr_name] = Value::Array(Vec::new()); + } else if dedupe_config.skip_empty.unwrap_or(false) && document_key.trim().is_empty() { + // skip empty documents if dedupe_config.skip_empty is true + // and the document key is empty after trimming (i.e., removing whitespace) + attributes[attr_name] = Value::Array(Vec::new()); + } else { + let dedupe_key = VecDeque::from([document_key.as_str()]); + + //Just compute the first hash to see if it matches the partition + // num_observed += 1; + let hashes = build_hashes( + &bloom_filter, + &dedupe_key, + dedupe_config.num_partitions.unwrap_or(1), + dedupe_config.partition_index.unwrap_or(0), + ); + + if !hashes.is_empty() { + //num_processed += 1; + //Compute the remaining hashes + if bloom_filter.contains(&hashes) { + // attributes[&cfg.attribute_name] = Value::Bool(true); + + let mut duplicate_docs_array = Vec::new(); + let attr = vec![ + Value::from(0), + Value::Number(document_key.len().into()), + Value::from(1), + ]; + duplicate_docs_array.push(Value::Array(attr)); + removed_bytes += document_key.len(); + attributes[attr_name] = Value::Array(duplicate_docs_array); + } else if !bloom_filter.read_only { + bloom_filter.insert(&hashes); + } + } else { + //The dedupe key doesn't belong to this partition + attributes[attr_name] = Value::Array(Vec::new()); + } + } + (attributes, seen_bytes, removed_bytes) } +pub fn dedupe_paragraphs( + data: Value, + dedupe_config: DedupeConfig, + bloom_filter: &Arc, +) -> (Value, usize, usize) { + let mut attributes = json!({}); + let cfg = dedupe_config.paragraphs.unwrap(); + let min_content_length = dedupe_config.min_length.unwrap_or(0); + let min_word_count = dedupe_config.min_words.unwrap_or(0); + let text = 
data["text"].as_str().unwrap(); + let text_length = text.len(); + let seen_bytes = text_length; + let mut removed_bytes = 0; + let mut offset = 0; + + if text_length == 0 { + return (attributes, seen_bytes, removed_bytes) + } + + + let paragraphs = text.split(cfg.paragraph_separator.as_deref().unwrap_or("\n")); + let mut duplicate_paragraph_spans = Vec::new(); + + for p in paragraphs { + // Get start,end in half-open intervals like [start:end) + let par_start = offset; + let par_char_length = p.chars().count(); + offset += par_char_length; + if offset < text_length - 1 { + offset += 1; + } + let par_end = offset; + + // Skip degenerate cases + if par_char_length < min_content_length { continue; } + if tokenize(&p).count() < min_word_count { continue; } + if dedupe_config.skip_empty.unwrap_or(false) && p.trim().is_empty() { continue; } + + // If not doing ngrams, then the whole paragraph is "one ngram" (simulated) [for code simplicity] + let (ngram_len, stride, threshold, skip_short) = if cfg.by_ngram.is_none() || cfg.by_ngram.as_ref().unwrap().ngram_length == 0 { + (usize::MAX, 1, 1.0, false) + } else { + let by_ngram = cfg.by_ngram.clone().unwrap(); + (by_ngram.ngram_length, by_ngram.stride, by_ngram.overlap_threshold, by_ngram.skip_short_paragraphs.unwrap_or(false)) + }; + + + // And now iterate through the words/tokens + + let mut current_ngram = if ngram_len < usize::MAX { + VecDeque::with_capacity(ngram_len)} + else { + VecDeque::new() + }; + let mut hashes_to_add = Vec::new(); + let mut stride_status = 0; + for token in tokenize(&p) { + // Warmup phase + if hashes_to_add.len() == 0 && current_ngram.len() < ngram_len - 1 { + current_ngram.push_back(token); + continue; + } + + // Once warm, if at stride check, make hash and store it + current_ngram.push_back(token); + if stride_status == 0 { + let hashes = build_hashes( + &bloom_filter, + &current_ngram.clone(), + dedupe_config.num_partitions.unwrap_or(1), + dedupe_config.partition_index.unwrap_or(0)); +
hashes_to_add.push(hashes); + } + + stride_status = (stride_status + 1) % stride.max(1); // a stride of 0 behaves like 1 instead of dividing by zero + current_ngram.pop_front(); + } + + // If paragraph was too short... + if hashes_to_add.len() == 0 && !skip_short{ + let hashes = build_hashes( + &bloom_filter, + &current_ngram, + dedupe_config.num_partitions.unwrap_or(1), + dedupe_config.partition_index.unwrap_or(0)); + hashes_to_add.push(hashes); + } + + + // Get containment numbers: + let total_hashes = hashes_to_add.len(); + let mut contain_count = 0; + for hash in &hashes_to_add { + if bloom_filter.contains(&hash) { + contain_count += 1; + } + } + + // If containment matches threshold, set span and add hashes + // (seen_bytes already covers the whole text, so the paragraph span is not added to it again) + if (contain_count as f64 / total_hashes as f64) >= threshold.into() { + let span = vec![ + Value::Number(par_start.into()), + Value::Number(par_end.into()), + Value::from(contain_count as f64 / total_hashes as f64), + ]; + // add span to duplicate_paragraph_spans + removed_bytes += par_end - par_start; + duplicate_paragraph_spans.push(Value::Array(span)); + } else if !bloom_filter.read_only { + for hash in hashes_to_add { + bloom_filter.insert(&hash); + } + } + + } + let attr_name_with_index; + let attr_name = if dedupe_config.num_partitions.unwrap_or(1) > 1 { + attr_name_with_index = format!( + "{}_{}", + cfg.attribute_name, + dedupe_config.partition_index.unwrap_or(0) + ); + &attr_name_with_index + } else { + &cfg.attribute_name + }; + attributes[attr_name] = Value::Array(duplicate_paragraph_spans); + (attributes, seen_bytes, removed_bytes) +} + + + + +pub fn dedupe_dclm( + data: Value, + dedupe_config: DedupeConfig, + bloom_filter: &Arc, +) -> (Value, usize, usize) { + // Setup/init for DCLM-style dedup + // Break into paragraphs and skip the too-short paragraphs + // For each paragraph: if >threshold of the ngrams are seen before, mark this paragraph as duplicate + // For whole document: amongst long paragraphs only, if >threshold of ngrams have been seen before, mark whole document as
duplicate + // If whole document is duplicate: add nothing to the bloom filter + // If only some paragraphs are duplicates, add just those hashes to the bloom filter + + + // Set things up: + let mut attributes = json!({}); + let cfg = dedupe_config.dclm.unwrap(); + let ngram_params = cfg.by_ngram; + let min_content_length = dedupe_config.min_length.unwrap_or(0); + let attr_name = if dedupe_config.num_partitions.unwrap_or(1) > 1 { + &format!( + "{}_{}", + cfg.attribute_name, + dedupe_config.partition_index.unwrap_or(0) + ) + } else { + &cfg.attribute_name + }; + let text = data["text"].as_str().unwrap(); + let text_length = text.len(); + let splitter = cfg.paragraph_separator.as_deref().unwrap_or("\n"); + let paragraphs = text.split(splitter); + let mut duplicate_paragraph_spans = Vec::new(); + let mut total_ngrams = 0; + let mut total_contained_ngrams = 0; + let seen_bytes = text.len(); + let mut removed_bytes = 0; + let mut hashes_to_insert: Vec> = Vec::new(); + let mut offset = 0; + let stride = ngram_params.stride; + + if text_length == 0 { return (attributes, seen_bytes, removed_bytes); } // degenerate empty case + + + for p in paragraphs { + // Get par start/end + let par_start = offset; + let par_char_length = p.chars().count(); + offset += par_char_length; + if offset < text_length - 1 { + offset += splitter.len(); + } + let par_end = offset; + + // Skip degenerate cases + if par_char_length < min_content_length { continue; } // Skip paragraph: too short (in chars) + + // Set more things up + let mut hashes: Vec> = Vec::new(); + let mut ngram: VecDeque<&str> = VecDeque::with_capacity(ngram_params.ngram_length); + let mut stride_status = 0; + + // Then loop over tokens/words + + for token in tokenize(p) { + if hashes.len() == 0 && ngram.len() < ngram_params.ngram_length - 1 { + ngram.push_back(token); + continue; + } + ngram.push_back(token); + if stride_status == 0 { + let this_hash = build_hashes( + &bloom_filter, + &ngram.clone(), + 
dedupe_config.num_partitions.unwrap_or(1), + dedupe_config.partition_index.unwrap_or(1)); + hashes.push(this_hash); + } + stride_status = (stride_status + 1) % stride; + ngram.pop_front(); + + } + if hashes.len() == 0 { + continue; + } // Skip paragraph: too short (in tokens ) + + + // Check containment and keep track of whether we should keep/delete this para + let contained_ngrams = hashes.iter().filter(|h| bloom_filter.contains(h)).count(); + total_ngrams += hashes.len(); + total_contained_ngrams += contained_ngrams; + + let paragraph_duplicate_score = contained_ngrams as f32 / hashes.len() as f32; + let should_remove = paragraph_duplicate_score >= ngram_params.overlap_threshold; + + if should_remove { + duplicate_paragraph_spans.push(Value::Array(vec![ + Value::from(par_start), + Value::from(par_end), + Value::from(paragraph_duplicate_score), + ])); + removed_bytes += par_end - par_start; + } else { + hashes_to_insert.extend(hashes); + } + } + + // If all paragraphs in aggregate are duplicates, then make adjustments + if total_ngrams > 0 { + let document_score = (total_contained_ngrams / total_ngrams) as f32; + if (total_contained_ngrams / total_ngrams) as f32 >= ngram_params.overlap_threshold { + duplicate_paragraph_spans.clear(); + duplicate_paragraph_spans.push(Value::Array(vec![ + Value::from(0), + Value::from(text_length), + Value::from(document_score), + ])); + hashes_to_insert.clear(); + } + + if !bloom_filter.read_only { + for h in hashes_to_insert { + bloom_filter.insert(&h); + } + } + } + attributes[attr_name] = Value::Array(duplicate_paragraph_spans); + (attributes, seen_bytes, removed_bytes) +} + +/*================================================================= += CONFIG DEFINITIONS = +=================================================================*/ + pub mod deduper_config { use serde::{Deserialize, Serialize}; use std::io; @@ -507,13 +698,13 @@ pub mod deduper_config { pub document_key: Option, } - #[derive(Serialize, Deserialize, Clone)] + 
#[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct DocumentDedupeConfig {
         pub attribute_name: String,
         pub key: String,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
+    #[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct ParagraphDedupeConfig {
         pub attribute_name: String,
         // If defined, remove paragraphs based on contained ngrams
@@ -524,7 +715,7 @@ pub mod deduper_config {
         pub paragraph_separator: Option,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
+    #[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct NgramDedupeConfig {
         // Number of whitespace-delimited tokens per ngram
         pub ngram_length: usize,
@@ -536,11 +727,21 @@ pub mod deduper_config {
         pub skip_short_paragraphs: Option,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
+    #[derive(Serialize, Deserialize, Clone, Debug)]
+    pub struct DCLMDedupeConfig {
+        // DCLMDedupeConfig does a hybrid of both fuzzy document and paragraph level deduplication
+        // Name of the attribute the duplicate spans are written under; dedupe_dclm
+        // appends "_<partition_index>" to it when num_partitions > 1.
+        pub attribute_name: String,
+        // N-gram settings; dedupe_dclm reads ngram_length, stride and
+        // overlap_threshold from it, so unlike ParagraphDedupeConfig it is required.
+        pub by_ngram: NgramDedupeConfig, // NOT OPTIONAL
+        // Separator the text is split on; dedupe_dclm falls back to "\n" when unset.
+        pub paragraph_separator: Option,
+    }
+
+    #[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct DedupeConfig {
         pub name: String,
+        pub dedupe_method: String,
         pub documents: Option,
         pub paragraphs: Option,
+        pub dclm: Option,
         pub min_length: Option,
         pub min_words: Option,
         pub skip_empty: Option,
@@ -548,7 +749,7 @@ pub mod deduper_config {
         pub partition_index: Option,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
+    #[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct DeduperConfig {
         pub documents: Vec,
         pub work_dir: WorkDirConfig,
diff --git a/src/lib.rs b/src/lib.rs
index 574af7ef..c6229e54 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ pub mod bloom_filter;
 pub mod deduper;
 pub mod filters;
 pub mod io;
+pub mod log_pbar;
 pub mod mixer;
 pub mod s3_util;
 pub mod shard;
diff --git a/src/log_pbar.rs b/src/log_pbar.rs
new file mode 100644
index 00000000..b63eefee
--- /dev/null
+++ b/src/log_pbar.rs
@@ -0,0 +1,49 @@
+use indicatif::{ProgressBar, ProgressStyle};
+use log::info;
+
+/// Terminal progress bar that also reports its progress through the `log`
+/// crate, so progress stays visible when output is captured as plain logs.
+pub struct LogProgressBar {
+    progress_bar: ProgressBar,
+    /// Bar position at the time of the most recent log line.
+    last_logged_position: u64,
+}
+
+impl LogProgressBar {
+    /// Create a bar that expects `total` increments.
+    ///
+    /// # Panics
+    /// Panics if `total` does not fit in a `u64`.
+    pub fn new(total: usize) -> Self {
+        let progress_bar = ProgressBar::new(total.try_into().unwrap());
+        progress_bar.set_style(
+            ProgressStyle::default_bar()
+                .template("{elapsed_precise} [{bar:40.cyan/blue}] {pos}/{len} ({eta}) {msg}")
+                .unwrap()
+                .progress_chars("#>-"),
+        );
+
+        Self {
+            progress_bar,
+            last_logged_position: 0,
+        }
+    }
+
+    /// Advance the bar by `delta` and emit a log line roughly every 10% of the total.
+    pub fn inc(&mut self, delta: u64) {
+        self.progress_bar.inc(delta);
+        self.log_progress_if_needed();
+    }
+
+    /// Finish the bar with `msg` and log the final position.
+    ///
+    /// Public so callers can actually finish the bar; it was private and
+    /// therefore unreachable from outside this module.
+    pub fn finish_with_message(&mut self, msg: &str) {
+        self.progress_bar.finish_with_message(msg.to_string());
+        self.log_progress();
+    }
+
+    fn log_progress_if_needed(&mut self) {
+        let current_position = self.progress_bar.position();
+        // At least 1, so bars with fewer than 10 steps do not log on no-op
+        // increments; `unwrap_or` avoids a panic if the length is ever unset.
+        let log_step = (self.progress_bar.length().unwrap_or(0) / 10).max(1);
+        // Saturating: the position may be reset below the last logged value.
+        if current_position.saturating_sub(self.last_logged_position) >= log_step {
+            self.log_progress();
+        }
+    }
+
+    fn log_progress(&mut self) {
+        // `ProgressBar::println` prints the given text above the bar; it does
+        // not write the bar state into its argument, so the old code always
+        // logged an empty line. Build the message from the bar state instead.
+        let position = self.progress_bar.position();
+        let total = self.progress_bar.length().unwrap_or(0);
+        info!(
+            "{}/{} (elapsed {}, eta {})",
+            position,
+            total,
+            indicatif::HumanDuration(self.progress_bar.elapsed()),
+            indicatif::HumanDuration(self.progress_bar.eta()),
+        );
+        self.last_logged_position = position;
+    }
+}
diff --git a/src/shard.rs b/src/shard.rs
index d5992caa..f15d87cf 100644
--- a/src/shard.rs
+++ b/src/shard.rs
@@ -489,8 +489,8 @@ pub mod shard_config {
         pub compression: Option,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
-    pub struct CompressionConfig {
+    #[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct CompressionConfig {
         pub input: Option,
         pub output: Option,
     }
@@ -512,7 +512,7 @@ pub mod shard_config {
         pub min_text_length: Option,
     }
 
-    #[derive(Serialize, Deserialize, Clone)]
+    #[derive(Serialize, Deserialize, Clone, Debug)]
     pub struct WorkDirConfig {
         pub input: String,
         pub output: String,
diff --git a/tests/config/dedupe-by-url.json b/tests/config/dedupe-by-url.json
index 1bc82ce8..99ed429c 100644
--- a/tests/config/dedupe-by-url.json
+++ b/tests/config/dedupe-by-url.json
@@ -18,7
+18,8 @@ "size_in_bytes": 0, "read_only": false, "estimated_doc_count": 1000, - "desired_false_positive_rate": 0.001 + "desired_false_positive_rate": 0.001, + "sysram_limit": 0.9 }, "processes": 1 } diff --git a/tests/config/dedupe-paragraph-dclm.json b/tests/config/dedupe-paragraph-dclm.json new file mode 100644 index 00000000..6117c303 --- /dev/null +++ b/tests/config/dedupe-paragraph-dclm.json @@ -0,0 +1,29 @@ +{ + "documents": [ + "tests/data/provided/deduper/doc*/dclm_bff_checker.jsonl.gz" + ], + "work_dir": { + "input": "tests/work/temp/dedupe-dclm/input", + "output": "tests/work/temp/dedupe-dclm/output" + }, + "dedupe": { + "name": "dedupe_dclm_ngrams", + "dedupe_method": "dclm", + "dclm": { + "attribute_name": "bff_duplicate_dclm_spans", + "by_ngram": { + "ngram_length": 6, + "stride": 1, + "overlap_threshold": 0.8 + } + } + }, + "bloom_filter": { + "file": "tests/work/para_bloom_filter.bin", + "size_in_bytes": 0, + "read_only": false, + "estimated_doc_count": 10000, + "desired_false_positive_rate": 0.00001 + }, + "processes": 1 +} diff --git a/tests/config/dedupe-paragraph-ngrams.json b/tests/config/dedupe-paragraph-ngrams.json index 6f599a85..1446b41b 100644 --- a/tests/config/dedupe-paragraph-ngrams.json +++ b/tests/config/dedupe-paragraph-ngrams.json @@ -1,6 +1,6 @@ { "documents": [ - "tests/data/provided/deduper/doc*/000.json.gz" + "tests/data/provided/deduper/doc*/dclm_bff_checker.jsonl.gz" ], "work_dir": { "input": "tests/work/temp/dedupe-para/input", @@ -12,7 +12,7 @@ "attribute_name": "bff_duplicate_paragraph_spans", "by_ngram": { "ngram_length": 6, - "stride": 3, + "stride": 1, "overlap_threshold": 0.5 } } @@ -22,7 +22,7 @@ "size_in_bytes": 0, "read_only": false, "estimated_doc_count": 1000, - "desired_false_positive_rate": 0.001 + "desired_false_positive_rate": 0.00001 }, "processes": 1 } diff --git a/tests/data/expected/dedupe-paragraph-ngrams.json.gz b/tests/data/expected/dedupe-paragraph-ngrams.json.gz deleted file mode 100644 index 
c22ba24c..00000000 Binary files a/tests/data/expected/dedupe-paragraph-ngrams.json.gz and /dev/null differ diff --git a/tests/data/expected/dedupe-paragraphs-ngrams.jsonl.gz b/tests/data/expected/dedupe-paragraphs-ngrams.jsonl.gz new file mode 100644 index 00000000..69b91a9f Binary files /dev/null and b/tests/data/expected/dedupe-paragraphs-ngrams.jsonl.gz differ diff --git a/tests/data/provided/deduper/documents/dclm_bff_checker.jsonl.gz b/tests/data/provided/deduper/documents/dclm_bff_checker.jsonl.gz new file mode 100644 index 00000000..1fc1a184 Binary files /dev/null and b/tests/data/provided/deduper/documents/dclm_bff_checker.jsonl.gz differ diff --git a/tests/python/test_deduper.py b/tests/python/test_deduper.py index 05a3081f..b0da532e 100644 --- a/tests/python/test_deduper.py +++ b/tests/python/test_deduper.py @@ -26,6 +26,7 @@ DEDUPE_BY_URL = TEST_DIR / "config/dedupe-by-url.json" DEDUPE_PARAGRAPHS = TEST_DIR / "config/dedupe-paragraphs.json" DEDUPE_PARAGRAPH_NGRAMS = TEST_DIR / "config/dedupe-paragraph-ngrams.json" +DEDUPE_DCLM = TEST_DIR / "config/dedupe-paragraph-dclm.json" D = TypeVar("D", bound="DedupeAttributesDict") @@ -42,7 +43,7 @@ def setUp(self) -> None: self.local_temp_dir = self.stack.enter_context(TemporaryDirectory()).rstrip("/") if skip_aws_tests(): - self.remote_test_prefix = None + self.remote_test_prefix = None else: self.remote_test_prefix = get_test_prefix() @@ -201,28 +202,49 @@ def test_dedupe_paragraphs_stride_math_skip_short(self): for (start_para, end_para), (start_span, end_span, _) in zip(valid_paragraphs, spans): self.assertEqual(doc["text"][start_para:end_para], doc["text"][start_span:end_span]) + def test_dedupe_paragraph_ngrams(self): with open(DEDUPE_PARAGRAPH_NGRAMS, "r") as f: config = json.load(f) config["documents"][0] = f'{self.local_temp_dir}/{config["documents"][0]}' config["bloom_filter"]["file"] = f'{self.local_temp_dir}/{config["bloom_filter"]["file"]}' - with NamedTemporaryFile("w") as f: json.dump(config, f) 
f.flush()
 
             main(argv=["-c", f.name, "dedupe"])
 
-        expected = load_jsonl("tests/data/expected/dedupe-paragraph-ngrams.json.gz")
+        expected = load_jsonl("tests/data/expected/dedupe-paragraphs-ngrams.jsonl.gz")
         print(
-            f"Loading data from {self.local_temp_dir}/tests/data/provided/attributes/dedupe_paragraph_ngrams/000.json.gz"
+            f"Loading data from {self.local_temp_dir}/tests/data/provided/attributes/dedupe_paragraph_ngrams/dclm_bff_checker.jsonl.gz"
         )
         computed = load_jsonl(
-            f"{self.local_temp_dir}/tests/data/provided/deduper/attributes/dedupe_paragraph_ngrams/000.json.gz"
+            f"{self.local_temp_dir}/tests/data/provided/deduper/attributes/dedupe_paragraph_ngrams/dclm_bff_checker.jsonl.gz"
         )
         return self._compare_dedupe_output(expected, computed)  # pyright: ignore
 
+
+
+    def test_dedupe_dclm_bff(self):
+        """Run the DCLM deduper on the checker fixture and compare it with the expected spans."""
+        with open(DEDUPE_DCLM, "r") as config_file:
+            dclm_config = json.load(config_file)
+
+        # Point the input documents and the bloom filter file at the temp dir.
+        temp_root = self.local_temp_dir
+        documents = dclm_config["documents"]
+        documents[0] = f"{temp_root}/{documents[0]}"
+        bloom_filter_config = dclm_config["bloom_filter"]
+        bloom_filter_config["file"] = f"{temp_root}/{bloom_filter_config['file']}"
+
+        # The deduper must run while the temporary config file still exists.
+        with NamedTemporaryFile("w") as rendered_config:
+            json.dump(dclm_config, rendered_config)
+            rendered_config.flush()
+
+            main(argv=["-c", rendered_config.name, "dedupe"])
+
+        attributes_path = (
+            f"{temp_root}/tests/data/provided/deduper/attributes/"
+            "dedupe_dclm_ngrams/dclm_bff_checker.jsonl.gz"
+        )
+        computed = load_jsonl(attributes_path)
+        expected = load_jsonl("tests/data/expected/dclm_bff_checker_exp.jsonl.gz")
+        return self._compare_dedupe_output(expected, computed)
+
+
     def _compare_dedupe_output(self, expected: List[D], computed: List[D]):
         self.assertEqual(len(expected), len(computed))
         for exp_row, comp_row in zip(expected, computed):
@@ -257,6 +279,8 @@ def test_dedupe_by_url_remote_input(self):
 
         return self._compare_dedupe_output(expected, computed)  # pyright: ignore
 
+
+
 class TestDeduperPipeline(TestCasePipeline):
     def test_skip_empty(self):
         duplicate_text = "More text"
@@ -364,7 +388,6 @@ def test_unicode(self):
             main(argv=["-c", f"{d}/config.json", "dedupe"])
 
             attributes =
load_jsonl(f"{attributes_dir}/dedupe_paragraphs/test.jsonl.gz") - first_dup, second_dup = attributes[0]["attributes"]["bff_duplicate_paragraph_spans"] self.assertEqual(text[first_dup[0] : first_dup[1]], paragraphs[2] + "\n")