diff --git a/Cargo.lock b/Cargo.lock index dda757b..3d22e63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -78,7 +87,7 @@ checksum = "ece550dd6710221de9bcdc1697424d8eee4fc4ca7e017479ea9d50c348465e37" dependencies = [ "byteorder", "crc", - "digest", + "digest 0.9.0", "lazy_static", "libflate", "num-bigint", @@ -94,6 +103,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "backtrace" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "base64" version = "0.21.2" @@ -106,10 +130,10 @@ version = "0.59.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cexpr", "clang-sys", - "clap", + "clap 2.34.0", "env_logger 0.9.3", "lazy_static", "lazycell", @@ -129,6 +153,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + [[package]] name = "block-buffer" version = "0.9.0" @@ -138,6 +168,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "build_const" version = "0.2.2" @@ -180,9 +219,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01" +dependencies = [ + "jobserver", + "libc", +] [[package]] name = "cexpr" @@ -242,13 +285,52 @@ checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ "ansi_term", "atty", - "bitflags", - "strsim", - "textwrap", + "bitflags 1.3.2", + "strsim 0.8.0", + "textwrap 0.11.0", "unicode-width", "vec_map", ] +[[package]] +name = "clap" +version = "3.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" +dependencies = [ + "atty", + "bitflags 1.3.2", + "clap_derive", + "clap_lex", + "indexmap", + "once_cell", + "strsim 0.10.0", + "termcolor", + "textwrap 0.16.0", +] + +[[package]] +name = "clap_derive" +version = "3.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008" +dependencies = [ + "heck 0.4.1", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", +] + [[package]] name = "cmake" version = "0.1.50" @@ -266,9 +348,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.7" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" dependencies = [ "libc", ] @@ -299,10 +381,10 @@ checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" dependencies = [ "atty", "cast", - "clap", + "clap 2.34.0", "criterion-plot", "csv", - "itertools", + "itertools 0.10.5", "lazy_static", "num-traits", "oorandom", @@ -324,7 +406,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -350,9 +432,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", "cfg-if", @@ -363,13 +445,23 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "csv" version = "1.2.2" @@ -424,17 +516,27 @@ dependencies = [ "generic-array", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common", +] + [[package]] name = "dyn-clone" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30" +checksum = "304e6508efa593091e97a9abbc10f90aa7ca635b6d2784feff3c89d41dd12272" [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "encoding_rs" @@ -473,9 +575,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f" dependencies = [ "errno-dragonfly", "libc", @@ -494,12 +596,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "1.9.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" [[package]] name = "fasttext" @@ -591,7 +690,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", ] [[package]] @@ -656,6 +755,12 @@ dependencies = [ "wasi 0.11.0+wasi-snapshot-preview1", ] +[[package]] +name = "gimli" +version = "0.27.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" + [[package]] name = "glob" version = "0.3.1" @@ -664,9 +769,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.19" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" +checksum = "97ec8491ebaf99c8eaa73058b045fe58073cd6be7f596ac993ced0b0a0c01049" dependencies = [ "bytes", "fnv", @@ -703,28 +808,25 @@ dependencies = [ ] [[package]] -name = "hermit-abi" -version = "0.1.19" +name = "heck" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "http" @@ -768,9 +870,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.26" +version = "0.14.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" +checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" dependencies = [ "bytes", "futures-channel", @@ -792,10 +894,11 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" +checksum = "8d78e1e73ec14cf7375674f74d7dde185c8206fd9dea6fb6295e8a98098aaa97" dependencies = [ + "futures-util", "http", "hyper", "rustls", @@ -855,22 +958,11 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys", -] - [[package]] name = "ipnet" -version = "2.7.2" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" +checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "itertools" @@ -881,17 +973,35 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -921,7 +1031,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" dependencies = [ "arrayvec", - "bitflags", + "bitflags 1.3.2", "cfg-if", "ryu", "static_assertions", @@ -929,9 +1039,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.146" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "libflate" @@ -971,9 +1081,9 @@ checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" [[package]] name = "linux-raw-sys" -version = "0.3.8" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" [[package]] name = "lock_api" @@ -987,9 +1097,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "518ef76f2f87365916b142844c16d8fefd85039bc5699050210a7778ee1cd1de" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" [[package]] name = "matches" @@ -1005,9 +1115,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -1088,9 +1198,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", "libm", @@ -1098,14 +1208,23 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi 0.3.2", "libc", ] +[[package]] +name = "object" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -1133,11 +1252,32 @@ dependencies = [ "num-traits", ] +[[package]] +name = "os_str_bytes" +version = "6.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac" + [[package]] name = "oscar-io" -version = "0.2.4" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3129df1afe61e0826c2a861fcc59c0772b33f808283d4ad6a652f36f3007a0" +checksum = "08bf1edf98688037a1e1252a642fda3c7dd8b830d460dd1f4ad58efba77d3fc6" +dependencies = [ + "avro-rs", + "flate2", + "log", + "schemars", + "serde", + "serde_json", + "warc", +] + +[[package]] +name = "oscar-io" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8dec9b3817e0004a8e4cfbf6a5c9edcdc1ac2da999b1316b96fa7a08480097b" dependencies = [ "avro-rs", "flate2", @@ -1147,6 +1287,27 @@ dependencies = [ "serde", "serde_json", "warc", + "zstd", +] + +[[package]] +name = "oscar-tools" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a24217c5cd1e8f14679e5999d296074a0f063767783730e8641996ddffaf6139" +dependencies = [ + "clap 3.2.25", + "env_logger 0.9.3", + "flate2", + "itertools 0.11.0", + "log", + "oscar-io 0.1.3", + "rand 0.8.5", + "rayon", + "runiq-lib", + "serde_json", + "sha2 0.10.7", + "walkdir", ] [[package]] @@ -1226,9 +1387,9 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "2c516611246607d0c04186886dbb3a754368ef82c79e9827a802c6d836dd111c" [[package]] name = "pin-utils" @@ -1236,11 +1397,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + [[package]] name = "plotters" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" dependencies = [ "num-traits", "plotters-backend", @@ -1251,15 +1418,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" [[package]] name = "plotters-svg" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" dependencies = [ "plotters-backend", ] @@ -1296,18 +1463,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.28" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" dependencies = [ "proc-macro2", ] @@ -1421,7 +1588,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -1430,14 +1597,26 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.8.4" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ "aho-corasick", "memchr", @@ -1446,9 +1625,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" [[package]] name = "reqwest" @@ -1519,12 +1698,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06f1b54cc97c736ebdad530f37f6ae5b574ea61721a3df9c8836ae6648d20a40" dependencies = [ "bytelines", - "clap", + "clap 2.34.0", "fnv", "scalable_bloom_filter", "twox-hash", ] +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1533,13 +1718,12 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.37.19" +version = "0.38.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399" dependencies = [ - "bitflags", + "bitflags 2.3.3", "errno", - "io-lifetimes", "libc", "linux-raw-sys", "windows-sys", @@ -1547,9 +1731,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.1" +version = "0.21.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" +checksum = "1d1feddffcfcc0b33f5c6ce9a29e341e4cd59c3f78e7ee45f4a40c038b1d6cbb" dependencies = [ "log", "ring", @@ -1559,18 +1743,18 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" +checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" dependencies = [ "base64", ] [[package]] name = "rustls-webpki" -version = "0.100.1" +version = "0.101.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +checksum = "261e9e0888cba427c3316e6322805653c9425240b6fd96cee7cb671ab70ab8d0" dependencies = [ "ring", "untrusted", @@ -1578,9 +1762,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" [[package]] name = "same-file" @@ -1626,9 +1810,9 @@ dependencies = [ [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sct" @@ -1642,9 +1826,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.164" +version = "1.0.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" +checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" dependencies = [ "serde_derive", ] @@ -1661,13 +1845,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.164" +version = "1.0.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" +checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", ] [[package]] @@ -1683,9 +1867,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" dependencies = [ "itoa", "ryu", @@ -1732,10 +1916,10 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", + "digest 0.9.0", "opaque-debug", ] @@ -1745,13 +1929,24 @@ version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest 0.10.7", +] + [[package]] name = "shlex" version = "1.1.0" @@ -1784,9 +1979,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" [[package]] name = "snap" @@ -1826,13 +2021,19 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "structopt" version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ - "clap", + "clap 2.34.0", "lazy_static", "structopt-derive", ] @@ -1843,7 +2044,7 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", @@ -1862,7 +2063,7 @@ version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87c85aa3f8ea653bfd3ddf25f7ee357ee4d204731f6aa9ad04002306f6e2774c" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro2", "quote", "syn 1.0.109", @@ -1881,9 +2082,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" dependencies = [ "proc-macro2", "quote", @@ -1904,11 +2105,10 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.6.0" +version = "3.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651" dependencies = [ - "autocfg", "cfg-if", "fastrand", "redox_syscall 0.3.5", @@ -1927,9 +2127,9 @@ dependencies = [ [[package]] name = "test-log" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f0c854faeb68a048f0f2dc410c5ddae3bf83854ef0e4977d58306a5edef50e" +checksum = "d9601d162c1d77e62c1ea0bc8116cd1caf143ce3af947536c3c9052a1677fe0c" dependencies = [ "proc-macro2", "quote", @@ -1945,24 +2145,30 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "textwrap" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" + [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", ] [[package]] @@ -2009,11 +2215,12 @@ checksum = "f762ca8308eda1e38512dc88a99f021e5214699ba133de157f588c8bfd0745c7" [[package]] name = "tokio" -version = "1.28.2" +version = "1.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" dependencies = [ "autocfg", + "backtrace", "bytes", "libc", "mio", @@ -2034,14 +2241,14 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", ] [[package]] name = "tokio-rustls" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", "tokio", @@ -2152,11 +2359,12 @@ dependencies = [ "futures-core", "futures-util", "glob", - "itertools", + "itertools 0.10.5", "language-tags", "lazy_static", "log", - "oscar-io", + "oscar-io 0.4.0", + "oscar-tools", "oxilangtag", "rand 0.8.5", "rand_distr", @@ -2168,7 +2376,7 @@ dependencies = [ "serde_json", "serial_test", "sha-1", - "sha2", + "sha2 0.9.9", "structopt", "tempfile", "test-log", @@ -2182,6 +2390,7 @@ dependencies = [ "url", "ut1_blocklist", "warc", + "zstd", ] [[package]] @@ -2375,9 +2584,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" [[package]] name = "unicode-normalization" @@ -2475,11 +2684,10 @@ dependencies = [ [[package]] name = "want" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" dependencies = [ - "log", "try-lock", ] @@ -2517,9 +2725,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2527,24 +2735,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -2554,9 +2762,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2564,22 +2772,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.28", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -2596,9 +2804,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", @@ -2685,9 +2893,9 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.48.0" +version = "0.48.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -2769,3 +2977,33 @@ dependencies = [ "syn 1.0.109", "synstructure", ] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 74251f4..a80da90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,8 @@ twox-hash = "1.6" glob = "0.3.0" sha2 = "0.9.5" +oscar-tools = { version = "0.4.0", optional = true } + serde = { version = "1", features = ["derive"] } serde_json = "1" schemars = "0.8.3" @@ -59,6 +61,7 @@ ctclib-pp = { version = "0.2.0", optional = true } [features] kenlm = ["dep:ctclib-pp"] +checksum = ["dep:oscar-tools"] [dev-dependencies] rand_distr = "0.4.2" @@ -97,4 +100,9 @@ ci = ["github"] # The installers to generate for each app installers = [] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["x86_64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-pc-windows-msvc", "aarch64-apple-darwin"] +targets = [ + "x86_64-unknown-linux-gnu", + "x86_64-apple-darwin", + "x86_64-pc-windows-msvc", + "aarch64-apple-darwin", +] diff --git a/src/cli.rs b/src/cli.rs index ae5fff9..95b90e4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -195,4 +195,8 @@ pub struct Pipeline { #[structopt(short = "c", long = "comp", help = "Enables zstd compression")] pub comp: bool, + + #[cfg(feature = "checksum")] + #[structopt(long = "checksum", help = "compute checksums")] + pub checksum: bool, } diff --git a/src/io/langfiles.rs b/src/io/langfiles.rs index 087f76f..46e322e 100644 --- a/src/io/langfiles.rs +++ b/src/io/langfiles.rs @@ -99,7 +99,13 @@ impl LangFilesDoc { } else { None }; - let w = Writer::new(dst, lang, part_size_bytes, comp)?; + + // add lang subfolder + let mut subfolder = dst.to_path_buf(); + subfolder.push(lang.to_string()); + std::fs::create_dir(&subfolder)?; + + let w = Writer::new(&subfolder, lang, part_size_bytes, comp)?; Ok(Arc::new(Mutex::new(w))) } @@ -219,7 +225,7 @@ mod tests { w.flush().unwrap(); } let mut read_path = PathBuf::from(dst.path()); - read_path.push("en.jsonl"); + read_path.push("en/en.jsonl"); let b = File::open(read_path).unwrap(); let doc_from_file: Document = serde_json::from_reader(b).unwrap(); @@ -252,7 +258,7 @@ mod tests { } let mut read_path = PathBuf::from(dst.path()); - read_path.push("en.jsonl.zstd"); + read_path.push("en/en.jsonl.zstd"); let b = File::open(&read_path).unwrap(); let dec = zstd::decode_all(b).unwrap(); diff --git a/src/main.rs b/src/main.rs index 63efb38..b4e6152 100644 --- a/src/main.rs +++ b/src/main.rs @@ -73,6 +73,8 @@ async fn main() -> Result<(), error::Error> { p.kenlms_path, p.split.map(|size_mbytes| size_mbytes * 1_000_000), p.comp, + #[cfg(feature = "checksum")] + p.checksum, ); p.run()?; diff --git a/src/pipelines/oscardoc/pipeline.rs b/src/pipelines/oscardoc/pipeline.rs index 68b05e0..7446639 100644 --- a/src/pipelines/oscardoc/pipeline.rs +++ b/src/pipelines/oscardoc/pipeline.rs @@ -60,8 +60,15 @@ pub struct OscarDoc { kenlms_path: Option, split: Option, // in bytes comp: bool, + #[cfg(feature = "checksum")] + checksum: bool, } +#[cfg(feature = "checksum")] +use oscar_tools::Checksum; +#[cfg(feature = "checksum")] +impl Checksum for OscarDoc {} + impl OscarDoc { pub fn new( src: PathBuf, @@ -71,6 +78,7 @@ impl OscarDoc { kenlms_path: Option, split: Option, comp: bool, + #[cfg(feature = "checksum")] checksum: bool, ) -> Self { if blocklist.is_none() { warn!("No blocklist folder specified! No adult content tagging will be done."); @@ -85,6 +93,8 @@ impl OscarDoc { kenlms_path, split, comp, + #[cfg(feature = "checksum")] + checksum, } } @@ -519,6 +529,16 @@ impl Pipeline<()> for OscarDoc { // flush writers info!("Flushing writers"); langfiles.flush_all()?; + std::mem::drop(langfiles); + //compute checksums + #[cfg(feature = "checksum")] + { + if self.checksum { + info!("Checksumming"); + Self::checksum_folder(&self.dst, 1).unwrap(); + } + } + info!("Done"); Ok(()) } diff --git a/src/pipelines/oscardoc/types/location.rs b/src/pipelines/oscardoc/types/location.rs index d1f7de3..0e7038d 100644 --- a/src/pipelines/oscardoc/types/location.rs +++ b/src/pipelines/oscardoc/types/location.rs @@ -191,8 +191,6 @@ impl Default for Location { #[cfg(test)] mod tests { - - use super::Location; use super::LocationBuilder; diff --git a/src/transformers/mod.rs b/src/transformers/mod.rs index b7779a0..d9acc3b 100644 --- a/src/transformers/mod.rs +++ b/src/transformers/mod.rs @@ -24,13 +24,13 @@ pub use annotate::Annotate; pub use annotate::Annotator; pub use content_detector::ContentDetector; pub use header::Header; -pub use lsh::LSH; #[cfg(feature = "kenlm")] pub use kenlm::AdultDetector; #[cfg(feature = "kenlm")] pub use kenlm::AdultDetectorBuilder; #[cfg(feature = "kenlm")] pub use kenlm::Models; +pub use lsh::LSH; pub use noisy::Noisy; pub use sentence_filter::Conv; pub use sentence_filter::RemoveShortSentences; diff --git a/tests/oscardoc_rebuild.rs b/tests/oscardoc_rebuild.rs index 5c77a64..4a913fe 100644 --- a/tests/oscardoc_rebuild.rs +++ b/tests/oscardoc_rebuild.rs @@ -24,7 +24,12 @@ fn gen_corpus() { let kenlm = Path::new("res/kenlm/").to_path_buf(); //TODO test with custom blocklists + #[cfg(feature = "checksum")] + let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm), None, false, false); + + #[cfg(not(feature = "checksum"))] let pipeline = OscarDoc::new(src, dst, lid, Some(bl), Some(kenlm), None, false); + pipeline.run().expect( "Ensure to have shards in res/shards, lid.176.bin at root and blocklist at res/blocklist", );