Skip to content

Commit

Permalink
Better parallelization and misc. upgrades
Browse files Browse the repository at this point in the history
Signed-off-by: James C. Wise <[email protected]>
  • Loading branch information
Scripter17 committed Dec 19, 2024
1 parent f60fe16 commit 0303f6b
Show file tree
Hide file tree
Showing 13 changed files with 269 additions and 221 deletions.
199 changes: 96 additions & 103 deletions Cargo.lock

Large diffs are not rendered by default.

11 changes: 5 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,22 @@ license = "AGPL-3.0-or-later"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.21", features = ["derive", "unstable-v5"] }
serde = { version = "1.0.215", features = ["derive"] }
clap = { version = "4.5.23", features = ["derive", "unstable-v5"] }
serde = { version = "1.0.216", features = ["derive"] }
serde_json = "1.0.133"
url = { version = "2.5.4", features = ["serde"] }
reqwest = { version = "0.12.9", features = ["blocking", "socks"], optional = true }
const-str = { version = "0.5.7", optional = true }
thiserror = "2.0.3"
thiserror = "2.0.8"
regex = { version = "1.11.1", optional = true }
glob = { version = "0.3.1", optional = true }
psl = "2.1.60"
psl = "2.1.72"
form_urlencoded = "1.2.1"
regex-syntax = { version = "0.8.5", optional = true }
percent-encoding = "2.3.1"
which = { version = "7.0.0", optional = true }
base64 = { version = "0.22.1", optional = true }
diesel = { version = "2.2.5", features = ["sqlite", "returning_clauses_for_sqlite_3_35"], optional = true }
diesel = { version = "2.2.6", features = ["sqlite", "returning_clauses_for_sqlite_3_35"], optional = true }

[features]
default = [
Expand Down Expand Up @@ -87,7 +87,6 @@ debug = []
debug-time = []

experiment-parallel = []
experiment-parallel-debug = []

# https://stackoverflow.com/a/54842093/10720231
[profile.release]
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,9 @@ Often the first few cleanings will take a few hundred milliseconds each because

Mileage varies wildly but as long as you're not spawning a new instance of URL Cleaner for each URL it should be fast enough.

Please note that URL Cleaner is currently single threaded because I don't know how to do it well. Parallelizing yourself (for example, with [GNU Parallel](https://www.gnu.org/software/parallel/)) may give better results, especially in network-bound tasks.
While URL Cleaner defaults to being single threaded, there is an experimental compilation feature flag to allow parallelization (`experiment-parallel`). For CPU bound jobs, increasing `--thread-queue` can give better times, while for network latency bound jobs increasing `--threads` can give *much* better times.

Exact behavior and implementation of parallelization is still in flux, and it will likely never be added as a library feature due to some nonsense involving POSIX pthreads.

#### Credits

Expand Down
38 changes: 31 additions & 7 deletions benchmarking/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ massif=1
dhat=1
memcheck=1
an_only_is_set=0
mode=
just_set_mode=0
urls_are_reset=0
num_mode=0
nums_are_reset=0
features=

for arg in "$@"; do
shift
Expand All @@ -42,18 +44,40 @@ for arg in "$@"; do
--only-massif) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0 ; dhat=0; memcheck=0; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;;
--only-dhat) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0; massif=0 ; memcheck=0; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;;
--only-memcheck) if [ $an_only_is_set -eq 0 ]; then an_only_is_set=1; hyperfine=0 ; callgrind=0; cachegrind=0; massif=0; dhat=0 ; else echo "Error: Multiple --only- flags were set."; exit 1; fi ;;
--nums) num_mode=1 ;;
*:*) if [ $urls_are_reset -eq 0 ]; then URLS=( ); urls_are_reset=1; fi; URLS=( ${URLS[@]} "$arg" ) ;;
[0123456789]*) if [ $num_mode -eq 1 ]; then if [ $nums_are_reset -eq 0 ]; then NUMS=( ); nums_are_reset=1; fi; NUMS=(${NUMS[@]} "$arg"); fi ;;
--) break ;;
*) echo Unknown option \"$arg\" && exit 1 ;;
--nums) mode=nums ; just_set_mode=1 ;;
--urls) mode=urls ; just_set_mode=1 ;;
--features) mode=features; just_set_mode=1 ;;
--) break ;;
--*) echo Unknown option \"$arg\"; exit 1 ;;
*) if [ "$mode" == "urls" ]; then
if [ $urls_are_reset -eq 0 ]; then
URLS=( )
urls_are_reset=1
fi
URLS=( ${URLS[@]} "$arg" )
elif [ "$mode" == "nums" ]; then
if [ $nums_are_reset -eq 0 ]; then
NUMS=( )
nums_are_reset=1
fi
NUMS=( ${NUMS[@]} "$arg" )
elif [ "$mode" == "features" ]; then
features_arg=--features
features="$arg"
mode=
else
echo "Modal arguments provided without a mode."
exit 1
fi ;;
esac
if [[ "$arg" =~ ^"--" && $just_set_mode -eq 0 ]]; then mode=; fi
just_set_mode=0
done

COMMAND="../target/release/url-cleaner --config ../default-config.json $@"
echo "$COMMAND"
if [ $compile -eq 1 ]; then
cargo build -r --config profile.release.strip=false --config profile.release.debug=2
cargo build -r $features_arg $features --config profile.release.strip=false --config profile.release.debug=2
if [ $? -ne 0 ]; then exit 2; fi
fi

Expand Down
44 changes: 25 additions & 19 deletions default-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,21 @@
"nerd.whatever.social", "z.opnxng.com"
],
"redirect-host-without-www-dot-prefixes": [
"2kgam.es", "4.nbcla.com", "a.co", "ab.co", "abc7.la", "abc7ne.ws", "adobe.ly", "aje.io", "aje.io", "amzn.asia", "amzn.ew", "amzn.to",
"apple.co", "b23.tv", "bbc.in", "bit.ly", "bitly.com", "bitly.com", "bityl.co", "blizz.ly", "blockclubchi.co", "bloom.bg", "boxd.it",
"buff.ly", "bzfd.it", "cbsn.ws", "cfl.re", "chn.ge", "chng.it", "clckhl.co", "cnb.cx", "cnn.it", "cos.lv", "cutt.ly", "db.tt", "dcdr.me",
"depop.app.link", "dis.gd", "dlvr.it", "econ.st", "etsy.me", "fal.cn", "fanga.me", "fb.me", "fdip.fr", "flip.it", "forms.gle", "g.co",
"glo.bo", "go.bsky.app", "go.forbes.com", "go.microsoft.com", "go.nasa.gov", "gofund.me", "goo.gl", "goo.su", "gum.co", "hmstr.fr",
"hulu.tv", "ift.tt", "intel.ly", "interc.pt", "is.gd", "iwe.one", "j.mp", "jbgm.es", "k00.fr", "katy.to", "kck.st", "kre.pe",
"l.leparisien.fr", "link.animaapp.com", "linkr.it", "lnk.to", "loom.ly", "loom.ly", "lpc.ca", "msft.it", "mzl.la", "n.pr", "nas.cr",
"nbc4i.co", "ninten.do", "ntdo.co.uk", "nvda.ws", "ny.ti", "nyer.cm", "nyp.st", "nyti.ms", "nyto.ms", "on.forbes.com", "on.ft.com",
"on.ft.com", "on.msnbc.com", "on.nyc.gov", "onl.bz", "onl.la", "onl.sc", "operagx.gg", "orlo.uk", "ow.ly", "peoplem.ag", "pin.it",
"pixiv.me", "play.st", "politi.co", "prn.to", "propub.li", "pulse.ly", "py.pl", "qr1.be", "rb.gy", "rb.gy", "rblx.co", "rdbl.co",
"redd.it", "reurl.cc", "reut.rs", "rzr.to", "s.goodsmile.link", "s.team", "s76.co", "shor.tf", "shorturl.at", "spoti.fi", "spr.ly",
"spr.ly", "sqex.to", "t.co", "t.ly", "theatln.tc", "thecut.io", "thr.cm", "thrn.co", "tiny.cc", "tmz.me", "to.pbs.org", "tps.to",
"tr.ee", "trib.al", "u.jd.com", "unes.co", "uni.cf", "visitlink.me", "w.wiki", "wlgrn.com", "wlo.link", "wn.nr", "wwdc.io", "x.gd",
"xbx.ly", "xhslink.com", "yrp.ca", "api.link.agorapulse.com", "perfht.ml", "share.firefox.dev", "unf.pa", "spr.ly", "thef.pub", "cons.lv", "kre.pe", "uniceflink.org"
"2kgam.es", "4.nbcla.com", "a.co", "ab.co", "abc7.la", "abc7ne.ws", "adobe.ly", "aje.io", "aje.io", "amzn.asia", "amzn.ew",
"amzn.to", "api.link.agorapulse.com", "apple.co", "b23.tv", "bbc.in", "bit.ly", "bitly.com", "bitly.com", "bityl.co", "blizz.ly",
"blockclubchi.co", "bloom.bg", "boxd.it", "buff.ly", "bzfd.it", "cbsn.ws", "cfl.re", "chn.ge", "chng.it", "clckhl.co", "cnb.cx",
"cnn.it", "cons.lv", "cos.lv", "cutt.ly", "db.tt", "dcdr.me", "depop.app.link", "dis.gd", "dlvr.it", "econ.st", "etsy.me", "fal.cn",
"fanga.me", "fb.me", "fdip.fr", "flip.it", "forms.gle", "g.co", "glo.bo", "go.bsky.app", "go.forbes.com", "go.microsoft.com",
"go.nasa.gov", "gofund.me", "goo.gl", "goo.su", "gum.co", "hmstr.fr", "hulu.tv", "ift.tt", "intel.ly", "interc.pt", "is.gd", "iwe.one",
"j.mp", "jbgm.es", "k00.fr", "katy.to", "kck.st", "kre.pe", "kre.pe", "l.leparisien.fr", "link.animaapp.com", "linkr.it", "lnk.to",
"loom.ly", "loom.ly", "lpc.ca", "msft.it", "mzl.la", "n.pr", "nas.cr", "nbc4i.co", "ninten.do", "ntdo.co.uk", "nvda.ws", "ny.ti",
"nyer.cm", "nyp.st", "nyti.ms", "nyto.ms", "on.forbes.com", "on.ft.com", "on.ft.com", "on.msnbc.com", "on.nyc.gov", "onl.bz",
"onl.la", "onl.sc", "operagx.gg", "orlo.uk", "ow.ly", "peoplem.ag", "perfht.ml", "pin.it", "pixiv.me", "play.st", "politi.co",
"prn.to", "propub.li", "pulse.ly", "py.pl", "qr1.be", "rb.gy", "rb.gy", "rblx.co", "rdbl.co", "redd.it", "reurl.cc", "reut.rs",
"rzr.to", "s.goodsmile.link", "s.team", "s76.co", "share.firefox.dev", "shor.tf", "shorturl.at", "spoti.fi", "spr.ly", "spr.ly",
"spr.ly", "sqex.to", "t.co", "t.ly", "theatln.tc", "thecut.io", "thef.pub", "thr.cm", "thrn.co", "tiny.cc", "tmz.me", "to.pbs.org",
"tps.to", "tr.ee", "trib.al", "u.jd.com", "unes.co", "unf.pa", "uni.cf", "uniceflink.org", "visitlink.me", "w.wiki", "wlgrn.com",
"wlo.link", "wn.nr", "wwdc.io", "x.gd", "xbx.ly", "xhslink.com", "yrp.ca"
],
"redirect-not-subdomains": [
"lnk.to", "visitlink.me", "goo.gl", "o93x.net", "pusle.ly"
Expand Down Expand Up @@ -296,10 +297,11 @@
"buymeacoff.ee", "buymeacoffee.com", "cach.me", "carleton.ca", "cash.app", "click.notification.elsevier.com", "deviantart.com",
"duckduckgo.com", "duckduckgo.com", "duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion", "e.emailalerts.cnn.com",
"facebook.com", "gf.me", "gofundme.com", "gofundme.com", "goodreads.com", "google.com", "gprivate.com", "href.li", "instagr.am",
"instagram.com", "l.instagram.com", "l.threads.com", "lmddgtfy.net", "lmgtfy2.com", "lnk.bio", "old.reddit.com", "open.substack.com",
"out.reddit.com", "pawoo.net", "pixiv.net", "preview.tinyurl.com", "proxy.notsobot.com", "rd.goodreads.com", "reddit.com",
"shareasale-analytics.com", "shareasale.com", "sketchfab.com", "sketchfab.com", "skfb.ly", "smarturl.it", "ss3.shipstation.com",
"steamcommunity.com", "subscribestar.adult", "substack.com", "t.umblr.com", "tinyurl.com", "toyhou.se", "youtube.com", "twpf.jp"
"instagram.com", "l.instagram.com", "l.threads.com", "lmddgtfy.net", "lmgtfy2.com", "lnk.bio", "m.vk.com", "old.reddit.com",
"open.substack.com", "out.reddit.com", "pawoo.net", "pixiv.net", "preview.tinyurl.com", "proxy.notsobot.com", "rd.goodreads.com",
"reddit.com", "shareasale-analytics.com", "shareasale.com", "sketchfab.com", "sketchfab.com", "skfb.ly", "smarturl.it",
"ss3.shipstation.com", "steamcommunity.com", "subscribestar.adult", "substack.com", "t.umblr.com", "tinyurl.com", "toyhou.se",
"twpf.jp", "vk.com", "youtube.com"
]},
{"InSet": "lmgtfy-hosts"},
{"LengthIs": 4},
Expand Down Expand Up @@ -718,7 +720,11 @@
"pulse.ly": {"All": [
{"SetHost": "api.link.agorapulse.com"},
{"SetPart": {"part": {"BeforePathSegment": 0}, "value": "redirect"}}
]}
]},
"vk.com": {"IfCondition": {
"condition": {"PathIs": "/away.php"},
"mapper": {"GetUrlFromQueryParam": "to"}
}}
}
}
},
Expand Down
4 changes: 4 additions & 0 deletions src/glue/caching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ impl InnerCache {
///
/// The inner [`Option`] is the cache entry.
/// # Errors
/// If the call to [`Self::connect`] returns an error, that error is returned.
///
/// If the call to [`RunQueryDsl::get_result`] returns an error, that error is returned.
pub fn read(&mut self, category: &str, key: &str) -> Result<Option<Option<String>>, ReadFromCacheError> {
debug!(InnerCache::read, self, category, key);
Expand All @@ -250,6 +252,8 @@ impl InnerCache {
///
/// If an entry doesn't exist, it is made.
/// # Errors
/// If the call to [`Self::connect`] returns an error, that error is returned.
///
/// If the call to [`RunQueryDsl::get_result`] returns an error, that error is returned.
pub fn write(&mut self, category: &str, key: &str, value: Option<&str>) -> Result<(), WriteToCacheError> {
debug!(InnerCache::write, self, category, key, value);
Expand Down
Loading

0 comments on commit 0303f6b

Please sign in to comment.