From 0303f6b6be46dba02609c8b7e04c8d7438693ace Mon Sep 17 00:00:00 2001 From: "James C. Wise" Date: Thu, 19 Dec 2024 16:22:39 -0500 Subject: [PATCH] Better parallelization and misc. upgrades Signed-off-by: James C. Wise --- Cargo.lock | 199 +++++++++++++++---------------- Cargo.toml | 11 +- README.md | 4 +- benchmarking/benchmark.sh | 38 ++++-- default-config.json | 44 ++++--- src/glue/caching.rs | 4 + src/main.rs | 178 +++++++++++++++------------ src/types/rules.rs | 2 +- src/types/rules/conditions.rs | 2 +- src/types/rules/mappers.rs | 2 +- src/types/string_matcher.rs | 2 +- src/types/string_modification.rs | 2 +- src/types/string_source.rs | 2 +- 13 files changed, 269 insertions(+), 221 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f36eac3..3c62ef8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -122,15 +122,15 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "cc" -version = "1.2.1" +version = "1.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +checksum = "c31a0499c1dc64f458ad13872de75c0eb7e3fdb0e67964610c914b034fc5956e" dependencies = [ "shlex", ] @@ -143,9 +143,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = 
"3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -153,9 +153,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -177,9 +177,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" @@ -284,9 +284,9 @@ dependencies = [ [[package]] name = "diesel" -version = "2.2.5" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf9649c05e0a9dbd6d0b0b8301db5182b972d0fd02f0a7c6736cf632d7c0fd5" +checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ "diesel_derives", "libsqlite3-sys", @@ -372,19 +372,19 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] 
name = "fnv" @@ -510,9 +510,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "heck" @@ -520,26 +520,20 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -577,9 +571,9 @@ checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "hyper" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" +checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" dependencies = [ "bytes", "futures-channel", @@ -597,9 +591,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "f6884a48c6826ec44f524c7456b163cebe9e55a18d7b5e307cb4f100371cc767" dependencies = [ "futures-util", "http", @@ -794,9 +788,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown", @@ -816,24 +810,25 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itoa" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] name = "libc" -version = "0.2.164" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libsqlite3-sys" @@ -883,20 +878,19 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" 
+checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi", "libc", "wasi", "windows-sys 0.52.0", @@ -1025,9 +1019,9 @@ dependencies = [ [[package]] name = "psl" -version = "2.1.60" +version = "2.1.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add4147324c04b450a4eeb919da7e3a86363621b351a39c5a386564720d4e465" +checksum = "699f79b15ea465a0a1ef3c7f0ef2b6b6782a2f720f3587428f0eea37f5c1d3b9" dependencies = [ "psl-types", ] @@ -1156,22 +1150,22 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.41" +version = "0.38.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "rustls" -version = "0.23.18" +version = "0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ "once_cell", "rustls-pki-types", @@ -1191,9 +1185,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = 
"d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" [[package]] name = "rustls-webpki" @@ -1236,9 +1230,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +checksum = "1863fd3768cd83c56a7f60faa4dc0d403f1b6df0a38c3c25f44b7894e45370d5" dependencies = [ "core-foundation-sys", "libc", @@ -1246,18 +1240,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.215" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" dependencies = [ "proc-macro2", "quote", @@ -1311,9 +1305,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -1345,9 +1339,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.89" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" +checksum = 
"919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -1419,11 +1413,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.3" +version = "2.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a" dependencies = [ - "thiserror-impl 2.0.3", + "thiserror-impl 2.0.8", ] [[package]] @@ -1439,9 +1433,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.3" +version = "2.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943" dependencies = [ "proc-macro2", "quote", @@ -1450,9 +1444,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -1471,9 +1465,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -1491,9 +1485,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.41.1" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = 
"5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -1516,12 +1510,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ "rustls", - "rustls-pki-types", "tokio", ] @@ -1539,9 +1532,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -1558,9 +1551,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-core", @@ -1568,9 +1561,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] @@ -1622,7 +1615,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.3", + "thiserror 2.0.8", "url", "which", ] @@ -1674,9 +1667,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.99" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -1685,13 +1678,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn", @@ -1700,21 +1692,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1722,9 +1715,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -1735,15 +1728,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.99" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index fb52152..b040192 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,22 +11,22 @@ license = "AGPL-3.0-or-later" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -clap = { version = "4.5.21", features = ["derive", "unstable-v5"] } -serde = { version = "1.0.215", features = ["derive"] } +clap = { version = "4.5.23", features = ["derive", "unstable-v5"] } +serde = { version = "1.0.216", features = ["derive"] } serde_json = "1.0.133" url = { version = "2.5.4", features = ["serde"] } reqwest = { version = "0.12.9", features = ["blocking", "socks"], optional = true } const-str = { version = "0.5.7", optional = true } -thiserror = "2.0.3" +thiserror = "2.0.8" regex = { version = "1.11.1", optional = true } glob = { version = "0.3.1", optional = true } -psl = "2.1.60" +psl = "2.1.72" form_urlencoded = "1.2.1" regex-syntax = { version = "0.8.5", optional = true } percent-encoding = "2.3.1" which = { version = "7.0.0", optional = true } base64 = { version = "0.22.1", optional = true } -diesel = { version = "2.2.5", features = ["sqlite", "returning_clauses_for_sqlite_3_35"], optional = true } +diesel = { version = "2.2.6", features = ["sqlite", "returning_clauses_for_sqlite_3_35"], optional = true } [features] default = [ @@ -87,7 +87,6 @@ debug = [] debug-time = [] experiment-parallel = [] -experiment-parallel-debug = [] # 
https://stackoverflow.com/a/54842093/10720231 [profile.release] diff --git a/README.md b/README.md index 47be572..0db8a84 100755 --- a/README.md +++ b/README.md @@ -184,7 +184,9 @@ Often the first few cleanings will take a few hundred milliseconds each because Mileage varies wildly but as long as you're not spawning a new instance of URL Cleaner for each URL it should be fast enough. -Please note that URL Cleaner is currently single threaded because I don't know how to do it well. Parallelizing yourself (for example, with [GNU Parallel](https://www.gnu.org/software/parallel/)) may give better results, especially in network-bound tasks. +While URL Cleaner defaults to being single threaded, there is an experimental compilation feature flag to allow parallelization (`experiment-parallel`). For CPU bound jobs, increasing `--thread-queue` can give better times, whereas for network latency bound jobs increasing `--threads` can give *much* better times. + +Exact behavior and implementation of parallelization is still in flux, and it will likely never be added as a library feature due to some nonsense involving POSIX pthreads. 
#### Credits diff --git a/benchmarking/benchmark.sh b/benchmarking/benchmark.sh index 7423d7e..50a70a1 100755 --- a/benchmarking/benchmark.sh +++ b/benchmarking/benchmark.sh @@ -20,9 +20,11 @@ massif=1 dhat=1 memcheck=1 an_only_is_set=0 +mode= +just_set_mode=0 urls_are_reset=0 -num_mode=0 nums_are_reset=0 +features= for arg in "$@"; do shift @@ -42,18 +44,40 @@ for arg in "$@"; do --only-massif) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0 ; dhat=0; memcheck=0; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;; --only-dhat) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0; massif=0 ; memcheck=0; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;; --only-memcheck) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0; massif=0; dhat=0 ; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;; - --nums) num_mode=1 ;; - *:*) if [ $urls_are_reset -eq 0 ]; then URLS=( ); urls_are_reset=1; fi; URLS=( ${URLS[@]} "$arg" ) ;; - [0123456789]*) if [ $num_mode -eq 1 ]; then if [ $nums_are_reset -eq 0 ]; then NUMS=( ); nums_are_reset=1; fi; NUMS=(${NUMS[@]} "$arg"); fi ;; - --) break ;; - *) echo Unknown option \"$arg\" && exit 1 ;; + --nums) mode=nums ; just_set_mode=1 ;; + --urls) mode=urls ; just_set_mode=1 ;; + --features) mode=features; just_set_mode=1 ;; + --) break ;; + --*) echo Unknown option \"$arg\"; exit 1 ;; + *) if [ "$mode" == "urls" ]; then + if [ $urls_are_reset -eq 0 ]; then + URLS=( ) + urls_are_reset=1 + fi + URLS=( ${URLS[@]} "$arg" ) + elif [ "$mode" == "nums" ]; then + if [ $nums_are_reset -eq 0 ]; then + NUMS=( ) + nums_are_reset=1 + fi + NUMS=( ${NUMS[@]} "$arg" ) + elif [ "$mode" == "features" ]; then + features_arg=--features + features="$arg" + mode= + else + echo "Modal arguments provided without a mode." 
+ exit 1 + fi ;; esac + if [[ "$arg" =~ ^"--" && $just_set_mode -eq 0 ]]; then mode=; fi + just_set_mode=0 done COMMAND="../target/release/url-cleaner --config ../default-config.json $@" echo "$COMMAND" if [ $compile -eq 1 ]; then - cargo build -r --config profile.release.strip=false --config profile.release.debug=2 + cargo build -r $features_arg $features --config profile.release.strip=false --config profile.release.debug=2 if [ $? -ne 0 ]; then exit 2; fi fi diff --git a/default-config.json b/default-config.json index eef09ad..13c61dc 100755 --- a/default-config.json +++ b/default-config.json @@ -76,20 +76,21 @@ "nerd.whatever.social", "z.opnxng.com" ], "redirect-host-without-www-dot-prefixes": [ - "2kgam.es", "4.nbcla.com", "a.co", "ab.co", "abc7.la", "abc7ne.ws", "adobe.ly", "aje.io", "aje.io", "amzn.asia", "amzn.ew", "amzn.to", - "apple.co", "b23.tv", "bbc.in", "bit.ly", "bitly.com", "bitly.com", "bityl.co", "blizz.ly", "blockclubchi.co", "bloom.bg", "boxd.it", - "buff.ly", "bzfd.it", "cbsn.ws", "cfl.re", "chn.ge", "chng.it", "clckhl.co", "cnb.cx", "cnn.it", "cos.lv", "cutt.ly", "db.tt", "dcdr.me", - "depop.app.link", "dis.gd", "dlvr.it", "econ.st", "etsy.me", "fal.cn", "fanga.me", "fb.me", "fdip.fr", "flip.it", "forms.gle", "g.co", - "glo.bo", "go.bsky.app", "go.forbes.com", "go.microsoft.com", "go.nasa.gov", "gofund.me", "goo.gl", "goo.su", "gum.co", "hmstr.fr", - "hulu.tv", "ift.tt", "intel.ly", "interc.pt", "is.gd", "iwe.one", "j.mp", "jbgm.es", "k00.fr", "katy.to", "kck.st", "kre.pe", - "l.leparisien.fr", "link.animaapp.com", "linkr.it", "lnk.to", "loom.ly", "loom.ly", "lpc.ca", "msft.it", "mzl.la", "n.pr", "nas.cr", - "nbc4i.co", "ninten.do", "ntdo.co.uk", "nvda.ws", "ny.ti", "nyer.cm", "nyp.st", "nyti.ms", "nyto.ms", "on.forbes.com", "on.ft.com", - "on.ft.com", "on.msnbc.com", "on.nyc.gov", "onl.bz", "onl.la", "onl.sc", "operagx.gg", "orlo.uk", "ow.ly", "peoplem.ag", "pin.it", - "pixiv.me", "play.st", "politi.co", "prn.to", "propub.li", "pulse.ly", 
"py.pl", "qr1.be", "rb.gy", "rb.gy", "rblx.co", "rdbl.co", - "redd.it", "reurl.cc", "reut.rs", "rzr.to", "s.goodsmile.link", "s.team", "s76.co", "shor.tf", "shorturl.at", "spoti.fi", "spr.ly", - "spr.ly", "sqex.to", "t.co", "t.ly", "theatln.tc", "thecut.io", "thr.cm", "thrn.co", "tiny.cc", "tmz.me", "to.pbs.org", "tps.to", - "tr.ee", "trib.al", "u.jd.com", "unes.co", "uni.cf", "visitlink.me", "w.wiki", "wlgrn.com", "wlo.link", "wn.nr", "wwdc.io", "x.gd", - "xbx.ly", "xhslink.com", "yrp.ca", "api.link.agorapulse.com", "perfht.ml", "share.firefox.dev", "unf.pa", "spr.ly", "thef.pub", "cons.lv", "kre.pe", "uniceflink.org" + "2kgam.es", "4.nbcla.com", "a.co", "ab.co", "abc7.la", "abc7ne.ws", "adobe.ly", "aje.io", "aje.io", "amzn.asia", "amzn.ew", + "amzn.to", "api.link.agorapulse.com", "apple.co", "b23.tv", "bbc.in", "bit.ly", "bitly.com", "bitly.com", "bityl.co", "blizz.ly", + "blockclubchi.co", "bloom.bg", "boxd.it", "buff.ly", "bzfd.it", "cbsn.ws", "cfl.re", "chn.ge", "chng.it", "clckhl.co", "cnb.cx", + "cnn.it", "cons.lv", "cos.lv", "cutt.ly", "db.tt", "dcdr.me", "depop.app.link", "dis.gd", "dlvr.it", "econ.st", "etsy.me", "fal.cn", + "fanga.me", "fb.me", "fdip.fr", "flip.it", "forms.gle", "g.co", "glo.bo", "go.bsky.app", "go.forbes.com", "go.microsoft.com", + "go.nasa.gov", "gofund.me", "goo.gl", "goo.su", "gum.co", "hmstr.fr", "hulu.tv", "ift.tt", "intel.ly", "interc.pt", "is.gd", "iwe.one", + "j.mp", "jbgm.es", "k00.fr", "katy.to", "kck.st", "kre.pe", "kre.pe", "l.leparisien.fr", "link.animaapp.com", "linkr.it", "lnk.to", + "loom.ly", "loom.ly", "lpc.ca", "msft.it", "mzl.la", "n.pr", "nas.cr", "nbc4i.co", "ninten.do", "ntdo.co.uk", "nvda.ws", "ny.ti", + "nyer.cm", "nyp.st", "nyti.ms", "nyto.ms", "on.forbes.com", "on.ft.com", "on.ft.com", "on.msnbc.com", "on.nyc.gov", "onl.bz", + "onl.la", "onl.sc", "operagx.gg", "orlo.uk", "ow.ly", "peoplem.ag", "perfht.ml", "pin.it", "pixiv.me", "play.st", "politi.co", + "prn.to", "propub.li", "pulse.ly", "py.pl", "qr1.be", 
"rb.gy", "rb.gy", "rblx.co", "rdbl.co", "redd.it", "reurl.cc", "reut.rs", + "rzr.to", "s.goodsmile.link", "s.team", "s76.co", "share.firefox.dev", "shor.tf", "shorturl.at", "spoti.fi", "spr.ly", "spr.ly", + "spr.ly", "sqex.to", "t.co", "t.ly", "theatln.tc", "thecut.io", "thef.pub", "thr.cm", "thrn.co", "tiny.cc", "tmz.me", "to.pbs.org", + "tps.to", "tr.ee", "trib.al", "u.jd.com", "unes.co", "unf.pa", "uni.cf", "uniceflink.org", "visitlink.me", "w.wiki", "wlgrn.com", + "wlo.link", "wn.nr", "wwdc.io", "x.gd", "xbx.ly", "xhslink.com", "yrp.ca" ], "redirect-not-subdomains": [ "lnk.to", "visitlink.me", "goo.gl", "o93x.net", "pusle.ly" @@ -296,10 +297,11 @@ "buymeacoff.ee", "buymeacoffee.com", "cach.me", "carleton.ca", "cash.app", "click.notification.elsevier.com", "deviantart.com", "duckduckgo.com", "duckduckgo.com", "duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion", "e.emailalerts.cnn.com", "facebook.com", "gf.me", "gofundme.com", "gofundme.com", "goodreads.com", "google.com", "gprivate.com", "href.li", "instagr.am", - "instagram.com", "l.instagram.com", "l.threads.com", "lmddgtfy.net", "lmgtfy2.com", "lnk.bio", "old.reddit.com", "open.substack.com", - "out.reddit.com", "pawoo.net", "pixiv.net", "preview.tinyurl.com", "proxy.notsobot.com", "rd.goodreads.com", "reddit.com", - "shareasale-analytics.com", "shareasale.com", "sketchfab.com", "sketchfab.com", "skfb.ly", "smarturl.it", "ss3.shipstation.com", - "steamcommunity.com", "subscribestar.adult", "substack.com", "t.umblr.com", "tinyurl.com", "toyhou.se", "youtube.com", "twpf.jp" + "instagram.com", "l.instagram.com", "l.threads.com", "lmddgtfy.net", "lmgtfy2.com", "lnk.bio", "m.vk.com", "old.reddit.com", + "open.substack.com", "out.reddit.com", "pawoo.net", "pixiv.net", "preview.tinyurl.com", "proxy.notsobot.com", "rd.goodreads.com", + "reddit.com", "shareasale-analytics.com", "shareasale.com", "sketchfab.com", "sketchfab.com", "skfb.ly", "smarturl.it", + "ss3.shipstation.com", "steamcommunity.com", 
"subscribestar.adult", "substack.com", "t.umblr.com", "tinyurl.com", "toyhou.se", + "twpf.jp", "vk.com", "youtube.com" ]}, {"InSet": "lmgtfy-hosts"}, {"LengthIs": 4}, @@ -718,7 +720,11 @@ "pulse.ly": {"All": [ {"SetHost": "api.link.agorapulse.com"}, {"SetPart": {"part": {"BeforePathSegment": 0}, "value": "redirect"}} - ]} + ]}, + "vk.com": {"IfCondition": { + "condition": {"PathIs": "/away.php"}, + "mapper": {"GetUrlFromQueryParam": "to"} + }} } } }, diff --git a/src/glue/caching.rs b/src/glue/caching.rs index eaf6933..872e7f9 100644 --- a/src/glue/caching.rs +++ b/src/glue/caching.rs @@ -233,6 +233,8 @@ impl InnerCache { /// /// The inner [`Option`] is the cache entry. /// # Errors + /// If the call to [`Self::connect`] returns an error, that error is returned. + /// /// If the call to [`RunQueryDsl::get_result`] returns an error, that error is returned. pub fn read(&mut self, category: &str, key: &str) -> Result>, ReadFromCacheError> { debug!(InnerCache::read, self, category, key); @@ -250,6 +252,8 @@ impl InnerCache { /// /// If an entry doesn't exist, it is made. /// # Errors + /// If the call to [`Self::connect`] returns an error, that error is returned. + /// /// If the call to [`RunQueryDsl::get_result`] returns an error, that error is returned. pub fn write(&mut self, category: &str, key: &str, value: Option<&str>) -> Result<(), WriteToCacheError> { debug!(InnerCache::write, self, category, key, value); diff --git a/src/main.rs b/src/main.rs index 77012a0..121af03 100755 --- a/src/main.rs +++ b/src/main.rs @@ -133,12 +133,14 @@ pub struct Args { #[arg( long, verbatim_doc_comment)] pub test_config : bool, /// Amount of threads to process jobs in. + /// + /// Zero gets the current CPU threads. #[cfg(feature = "experiment-parallel")] - #[arg(long, default_value_t = 4)] + #[arg(long, default_value_t = 0)] pub threads: usize, /// Amount of jobs to do in each thread while waiting for other threads to return. 
#[cfg(feature = "experiment-parallel")] - #[arg(long, default_value_t = 10)] + #[arg(long, default_value_t = 100)] pub thread_queue: usize } @@ -276,7 +278,6 @@ fn main() -> Result { #[cfg(feature = "debug-time")] let x = std::time::Instant::now(); - #[cfg(not(feature = "experiment-parallel"))] let mut jobs = Jobs { #[cfg(feature = "cache")] cache: args.cache_path.as_deref().unwrap_or(&*config.cache_path).into(), @@ -294,6 +295,101 @@ fn main() -> Result { #[cfg(feature = "debug-time")] eprintln!("Make Jobs: {:?}", x.elapsed()); #[cfg(feature = "debug-time")] let x = std::time::Instant::now(); + #[cfg(feature = "experiment-parallel")] + { + let mut threads = args.threads; + if threads == 0 {threads = std::thread::available_parallelism().expect("To be able to get the available parallelism.").into();} + let (in_senders , in_recievers ) = (0..threads).map(|_| std::sync::mpsc::sync_channel::, MakeJobError>>(args.thread_queue)).collect::<(Vec<_>, Vec<_>)>(); + let (out_senders, out_recievers) = (0..threads).map(|_| std::sync::mpsc::sync_channel::, MakeJobError>>(args.thread_queue)).collect::<(Vec<_>, Vec<_>)>(); + + std::thread::scope(|s| { + in_recievers.into_iter().zip(out_senders).map(|(ir, os)| { + s.spawn(move || { + while let Ok(job_result) = ir.recv() { + os.send(job_result.map(|job| job.r#do())).expect("The receiver to still exist."); + } + }); + }).for_each(drop); + + let some_ok_ref = &some_ok; + let some_error_ref = &some_error; + + if json { + s.spawn(move || { + print!("{{\"Ok\":{{\"urls\":["); + let mut first_job = true; + + let mut disconnected = 0usize; + for or in out_recievers.iter().cycle() { + let recieved = or.recv(); + match recieved { + Ok(Ok(Ok(url))) => { + if !first_job {print!(",");} + print!("{{\"Ok\":{{\"Ok\":{}}}}}", str_to_json_str(url.as_str())); + *some_ok_ref.lock().expect("No panics.") = true; + first_job = false; + }, + Ok(Ok(Err(e))) => { + if !first_job {print!(",");} + 
print!("{{\"Ok\":{{\"Err\":{{\"message\":{},\"variant\":{}}}}}}}", str_to_json_str(&e.to_string()), str_to_json_str(&format!("{e:?}"))); + *some_error_ref.lock().expect("No panics.") = true; + first_job = false; + }, + Ok(Err(e)) => { + if !first_job {print!(",");} + print!("{{\"Err\":{{\"message\":{},\"variant\":{}}}}}", str_to_json_str(&e.to_string()), str_to_json_str(&format!("{e:?}"))); + *some_error_ref.lock().expect("No panics.") = true; + first_job = false; + }, + Err(_) => { + #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")] + {disconnected += 1;} + if disconnected == threads {break;} + } + } + } + + print!("]}}}}"); + }); + } else { + s.spawn(move || { + let mut disconnected = 0usize; + for or in out_recievers.iter().cycle() { + let recieved = or.recv(); + match recieved { + Ok(Ok(Ok(url))) => { + println!("{url}"); + *some_ok_ref.lock().expect("No panics.") = true; + }, + Ok(Ok(Err(e))) => { + println!(); + eprintln!("DoJobError\t{e:?}"); + *some_error_ref.lock().expect("No panics.") = true; + } + Ok(Err(e)) => { + println!(); + eprintln!("MakeJobError\t{e:?}"); + *some_error_ref.lock().expect("No panics.") = true; + } + Err(_) => { + #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")] + {disconnected += 1;} + if disconnected == threads {break;} + } + } + } + }); + } + + for (i, job) in jobs.iter().enumerate() { + #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")] + in_senders.get(i % threads).expect("The amount of senders to not exceet the count of senders to make.").send(job).expect("To successfuly send the Job."); + } + drop(in_senders); + }) + } + + #[cfg(not(feature = "experiment-parallel"))] if json { print!("{{\"Ok\":{{\"urls\":["); let mut first_job = true; @@ -322,82 +418,6 @@ fn main() -> Result { print!("]}}}}"); } else { - #[cfg(feature = "experiment-parallel")] - { - let (in_senders , in_recievers ) = (0..args.threads).map(|_| std::sync::mpsc::sync_channel::, 
MakeJobError>>(args.thread_queue)).collect::<(Vec<_>, Vec<_>)>(); - let (out_senders, out_recievers) = (0..args.threads).map(|_| std::sync::mpsc::sync_channel::, MakeJobError>>(args.thread_queue)).collect::<(Vec<_>, Vec<_>)>(); - - std::thread::scope(|s| { - let in_reciever_threads = in_recievers.into_iter().zip(out_senders).enumerate().map(|(i, (ir, os))| { - s.spawn(move || { - loop { - match ir.recv() { - Ok(job_result) => { - #[cfg(feature = "experiment-parallel-debug")] println!("Router {i} routing {job_result:?}"); - os.send(job_result.map(|job| job.r#do())).expect("The receiver to still exist."); - }, - Err(_) => { - #[cfg(feature = "experiment-parallel-debug")] println!("Router {i} done"); - break; - } - } - } - }); - }).collect::>(); - - let some_ok_ref = &some_ok; - let some_error_ref = &some_error; - - s.spawn(move || { - let mut disconnected = 0usize; - for or in out_recievers.iter().cycle() { - let recieved = or.recv(); - #[cfg(feature = "experiment-parallel-debug")] println!("Recieved {recieved:?}"); - match recieved { - Ok(job_result) => match job_result { - Ok(job) => match job { - Ok(url) => { - println!("{url}"); - *some_ok_ref.lock().expect("No panics.") = true; - }, - Err(e) => { - println!(); - eprintln!("DoJobError\t{e:?}"); - *some_error_ref.lock().expect("No panics.") = true; - } - }, - Err(e) => { - println!(); - eprintln!("MakeJobError\t{e:?}"); - *some_error_ref.lock().expect("No panics.") = true; - } - }, - Err(_) => {disconnected += 1; if disconnected == args.threads {break;}} - } - } - }); - - let mut jobs = Box::new(Jobs { - #[cfg(feature = "cache")] - cache: args.cache_path.as_deref().unwrap_or(&*config.cache_path).into(), - job_configs_source: { - let ret = args.urls.into_iter().map(|url| JobConfig::from_str(&url)); - if !io::stdin().is_terminal() { - Box::new(ret.chain(io::stdin().lines().map(|line| JobConfig::from_str(&line?)))) - } else { - Box::new(ret) - } - }, - config: Cow::Owned(config) - }); - for (i, job) in 
Box::leak(jobs).iter().enumerate() { - #[cfg(feature = "experiment-parallel-debug")] println!("Putting {i} in: {job:?}"); - in_senders.get(i % args.threads).expect("The amount of senders to not exceet the count of senders to make.").send(job).expect("To successfuly send the Job."); - } - drop(in_senders); - }) - } - #[cfg(not(feature = "experiment-parallel"))] for job in jobs.iter() { match job { Ok(job) => match job.r#do() { diff --git a/src/types/rules.rs b/src/types/rules.rs index edc7c20..a867759 100644 --- a/src/types/rules.rs +++ b/src/types/rules.rs @@ -273,7 +273,7 @@ pub enum RuleError { /// Custom error. #[error(transparent)] #[cfg(feature = "custom")] - Custom(Box) + Custom(Box) } impl Rule { diff --git a/src/types/rules/conditions.rs b/src/types/rules/conditions.rs index 0c9b4aa..8c092c6 100755 --- a/src/types/rules/conditions.rs +++ b/src/types/rules/conditions.rs @@ -650,7 +650,7 @@ pub enum ConditionError { /// Custom error. #[error(transparent)] #[cfg(feature = "custom")] - Custom(Box) + Custom(Box) } impl Condition { diff --git a/src/types/rules/mappers.rs b/src/types/rules/mappers.rs index b468b1a..08e88d2 100644 --- a/src/types/rules/mappers.rs +++ b/src/types/rules/mappers.rs @@ -539,7 +539,7 @@ pub enum MapperError { /// Custom error. #[error(transparent)] #[cfg(feature = "custom")] - Custom(Box), + Custom(Box), /// Returned when the requested part of a URL is [`None`]. #[error("The requested part of the URL was None.")] UrlPartIsNone diff --git a/src/types/string_matcher.rs b/src/types/string_matcher.rs index 60d2b96..026cd02 100644 --- a/src/types/string_matcher.rs +++ b/src/types/string_matcher.rs @@ -311,7 +311,7 @@ pub enum StringMatcherError { /// Custom error. 
#[error(transparent)] #[cfg(feature = "custom")] - Custom(Box) + Custom(Box) } impl StringMatcher { diff --git a/src/types/string_modification.rs b/src/types/string_modification.rs index 0a07b18..8ba8838 100644 --- a/src/types/string_modification.rs +++ b/src/types/string_modification.rs @@ -999,7 +999,7 @@ pub enum StringModificationError { /// Custom error. #[error(transparent)] #[cfg(feature = "custom")] - Custom(Box) + Custom(Box) } impl From for StringModificationError { diff --git a/src/types/string_source.rs b/src/types/string_source.rs index e4c8ade..bb3bd51 100644 --- a/src/types/string_source.rs +++ b/src/types/string_source.rs @@ -465,7 +465,7 @@ pub enum StringSourceError { /// Custom error. #[error(transparent)] #[cfg(feature = "custom")] - Custom(Box) + Custom(Box) }