
Commit

Many things. You know how my commits go
Scripter17 committed Nov 20, 2024
1 parent d1b5c2c commit 3b146c0
Showing 21 changed files with 1,234 additions and 581 deletions.
448 changes: 354 additions & 94 deletions Cargo.lock

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions Cargo.toml
@@ -11,20 +11,20 @@ license = "AGPL-3.0-or-later"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
-clap = { version = "4.5.20", features = ["derive", "unstable-v5"] }
-serde = { version = "1.0.214", features = ["derive"] }
-serde_json = "1.0.132"
-url = { version = "2.5.2", features = ["serde"] }
+clap = { version = "4.5.21", features = ["derive", "unstable-v5"] }
+serde = { version = "1.0.215", features = ["derive"] }
+serde_json = "1.0.133"
+url = { version = "2.5.3", features = ["serde"] }
reqwest = { version = "0.12.9", features = ["blocking", "socks"], optional = true }
const-str = { version = "0.5.7", optional = true }
-thiserror = "1.0.66"
+thiserror = "2.0.3"
regex = { version = "1.11.1", optional = true }
glob = { version = "0.3.1", optional = true }
-psl = "2.1.55"
+psl = "2.1.57"
form_urlencoded = "1.2.1"
regex-syntax = { version = "0.8.5", optional = true }
percent-encoding = "2.3.1"
-which = { version = "6.0.3", optional = true }
+which = { version = "7.0.0", optional = true }
base64 = { version = "0.22.1", optional = true }
diesel = { version = "2.2.4", features = ["sqlite", "returning_clauses_for_sqlite_3_35"], optional = true }

45 changes: 44 additions & 1 deletion README.md
@@ -134,7 +134,50 @@ Currently only one list is included in the default config:
Currently there is no command line syntax for them. There really should be.
<!--/cmd-->

-#### Citations
+#### But how fast is it?

Reasonably fast. [`benchmarking/benchmark.sh`] is a Bash script that runs hyperfine and valgrind benchmarks so I can reliably check for regressions.

On a mostly stock Lenovo ThinkPad T460s (Intel i5-6300U (4) @ 3.000GHz) running Kubuntu 24.10 (kernel 6.11.0) with "not much" going on (Firefox, Steam, etc. are closed), hyperfine gives me the following benchmark:

(The numbers are in milliseconds)

```json
{
"https://x.com?a=2": {
"0": 5.176,
"1": 5.455,
"10": 5.284,
"100": 5.859,
"1000": 9.194,
"10000": 45.828
},
"https://example.com?fb_action_ids&mc_eid&ml_subscriber_hash&oft_ck&s_cid&unicorn_click_id": {
"0": 5.351,
"1": 5.306,
"10": 5.313,
"100": 5.836,
"1000": 11.340,
"10000": 62.017
},
"https://www.amazon.ca/UGREEN-Charger-Compact-Adapter-MacBook/dp/B0C6DX66TN/ref=sr_1_5?crid=2CNEQ7A6QR5NM&keywords=ugreen&qid=1704364659&sprefix=ugreen%2Caps%2C139&sr=8-5&ufe=app_do%3Aamzn1.fos.b06bdbbe-20fd-4ebc-88cf-fa04f1ca0da8": {
"0": 5.516,
"1": 5.228,
"10": 5.562,
"100": 6.279,
"1000": 14.972,
"10000": 101.226
}
}
```
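To put those bulk numbers in perspective, here is a quick sketch of the amortized per-URL cost implied by the `x.com` entry above. It treats the keys as the number of URLs passed to one invocation and the `"0"` run as pure startup overhead; that is my reading of the benchmark output, not something the script guarantees.

```shell
# Amortized per-URL cost for the x.com entry, assuming the "0" run is
# startup overhead and "10000" is a single 10000-URL invocation.
startup=5.176   # ms, 0-URL invocation
bulk=45.828     # ms, 10000-URL invocation
awk -v s="$startup" -v b="$bulk" \
    'BEGIN { printf "%.1f microseconds per URL\n", (b - s) / 10000 * 1000 }'
```

That works out to roughly four microseconds per URL once startup is amortized, which is why batching URLs into one invocation matters so much.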

In practice, when using [URL Cleaner Site and its userscript](https://github.com/Scripter17/url-cleaner-site), performance is often up to 10x worse, because for some reason `GM_XMLHttpRequest` always takes at least 10ms on my machine. From basic testing, the Amazon homepage has about 1,000 URLs and takes 8-10 requests to clean all of them.

Mileage varies wildly, but as long as you're not spawning a new instance of URL Cleaner for each URL, it should be fast enough.

Please note that URL Cleaner is currently single threaded because I don't know how to parallelize it well. Parallelizing it yourself (for example, with [GNU Parallel](https://www.gnu.org/software/parallel/)) may give better results.
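For instance, a hypothetical GNU Parallel wrapper along those lines. The function name, the chunk size, and a `url-cleaner` binary on `$PATH` are all assumptions for illustration, not part of this repo:

```shell
# Hypothetical sketch: split stdin into 1000-line chunks and run one
# url-cleaner instance per chunk in parallel, one per CPU core by default.
# Requires GNU Parallel; the 1000-URL chunk size is arbitrary.
clean_in_bulk() {
    parallel --pipe -N1000 url-cleaner
}
# Usage (not run here): clean_in_bulk < urls.txt > cleaned.txt
```

Each chunk pays the startup cost once instead of per URL, which is the same amortization the benchmark above measures.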

#### Credits

The people and projects I have stolen various parts of the default config from.

2 changes: 2 additions & 0 deletions benchmarking/benchmark.sh
@@ -48,6 +48,8 @@ done

COMMAND="../target/release/url-cleaner --config ../default-config.json $@"

+echo "$COMMAND"

if [ $compile -eq 1 ]; then
cargo build -r --config profile.release.strip=false --config profile.release.debug=2
if [ $? -ne 0 ]; then exit 2; fi
247 changes: 144 additions & 103 deletions default-config.json

Large diffs are not rendered by default.

11 changes: 1 addition & 10 deletions src/glue/caching.rs
@@ -49,18 +49,9 @@ pub struct NewCacheEntry<'a> {
/// Convenience wrapper to contain the annoyingness of it all.
///
/// Internally it's an [`Arc`] of a [`Mutex`] so cloning is O(1) and sharing immutable references is not a problem.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
pub struct Cache(pub Arc<Mutex<InnerCache>>);

-impl Default for Cache {
-/// Has the "path" of `:memory:`, which just stores the database in memory until the program exits.
-///
-/// Seems like a reasonable default.
-fn default() -> Self {
-Self(Default::default())
-}
-}

/// The internals of [`Cache`] that handles lazily connecting.
pub struct InnerCache {
/// The path being connected to.
9 changes: 5 additions & 4 deletions src/lib.rs
@@ -4,8 +4,9 @@
//! # Examples
//! ```
//! use std::borrow::Cow;
//! use std::str::FromStr;
//! use url::Url;
//!
//!
//! use url_cleaner::types::*;
#![cfg_attr(feature = "cache", doc = "use url_cleaner::glue::Cache;")]
//!
@@ -33,9 +34,9 @@
#![cfg_attr(feature = "cache", doc = " // That's fine because cloning a `Cache` is extremely cheap, because it's an `Arc<Mutex<InnerCache>>`.")]
#![cfg_attr(feature = "cache", doc = " cache: config.cache_path.as_str().into(),")]
//! // Ideally you'll be handling URLs in bulk.
-//! job_config_source: Box::new(vec![
-//! Url::parse("https://example.com?utm_source=url-cleaner-docs").unwrap()
-//! ].into_iter().map(|url| Ok(url.into())))
+//! job_configs_source: Box::new([
+//! JobConfig::from_str("https://example.com?utm_source=url-cleaner-docs")
+//! ].into_iter())
//! };
//!
//! for job in jobs.iter() {
15 changes: 8 additions & 7 deletions src/main.rs
@@ -147,6 +147,7 @@ pub enum CliError {
#[error(transparent)] SerdeJsonError(#[from] serde_json::Error)
}

+/// Shorthand for serializing a string to JSON.
fn str_to_json_str(s: &str) -> String {
serde_json::to_string(s).expect("Serializing a string to never fail.")
}
@@ -272,10 +273,10 @@ fn main() -> Result<ExitCode, CliError> {
let mut jobs = Jobs {
#[cfg(feature = "cache")]
cache: args.cache_path.as_deref().unwrap_or(&*config.cache_path).into(),
-job_config_source: {
-let ret = args.urls.into_iter().map(|url| JobConfig::from_str(&url).map_err(Into::into));
+job_configs_source: {
+let ret = args.urls.into_iter().map(|url| JobConfig::from_str(&url));
if !io::stdin().is_terminal() {
-Box::new(ret.chain(io::stdin().lines().map(|line| JobConfig::from_str(&line?).map_err(Into::into))))
+Box::new(ret.chain(io::stdin().lines().map(|line| JobConfig::from_str(&line?))))
} else {
Box::new(ret)
}
@@ -328,19 +329,19 @@
},
Err(e) => {
println!();
-eprintln!("GetJobError\t{e:?}");
+eprintln!("MakeJobError\t{e:?}");
some_error = true;
}
}
}
}

#[cfg(feature = "debug-time")] eprintln!("Run Jobs: {:?}", x.elapsed());
-#[cfg(feature = "debug-time")] let x = std::time::Instant::now();
+// #[cfg(feature = "debug-time")] let x = std::time::Instant::now();

-#[cfg(feature = "debug-time")] drop(jobs);
+// #[cfg(feature = "debug-time")] drop(jobs);

-#[cfg(feature = "debug-time")] eprintln!("Drop Jobs: {:?}", x.elapsed());
+// #[cfg(feature = "debug-time")] eprintln!("Drop Jobs: {:?}", x.elapsed());
#[cfg(feature = "debug-time")] eprintln!("Total: {:?}", start_time.elapsed());

Ok(match (some_ok, some_error) {
8 changes: 4 additions & 4 deletions src/types/config.rs
@@ -67,7 +67,7 @@ impl Config {
/// If the specified file can't be loaded, returns the error [`GetConfigError::CantLoadConfigFile`].
///
/// If the config contained in the specified file can't be parsed, returns the error [`GetConfigError::CantParseConfigFile`].
-pub fn load_from_file(path: &Path) -> Result<Self, GetConfigError> {
+pub fn load_from_file<T: AsRef<Path>>(path: T) -> Result<Self, GetConfigError> {
serde_json::from_str(&read_to_string(path).map_err(GetConfigError::CantLoadConfigFile)?).map_err(GetConfigError::CantParseConfigFile)
}

@@ -106,7 +106,7 @@ impl Config {
/// If `path` is `Some` and the call to [`Self::load_from_file`] returns an error, that error is returned.
#[allow(dead_code, reason = "Public API.")]
#[cfg(feature = "default-config")]
-pub fn get_default_or_load(path: Option<&Path>) -> Result<Cow<'static, Self>, GetConfigError> {
+pub fn get_default_or_load<T: AsRef<Path>>(path: Option<T>) -> Result<Cow<'static, Self>, GetConfigError> {
Ok(match path {
Some(path) => Cow::Owned(Self::load_from_file(path)?),
None => Cow::Borrowed(Self::get_default()?)
@@ -119,7 +119,7 @@
/// # Errors
/// If the default config cannot be parsed, returns the error [`GetConfigError::CantParseDefaultConfig`].
#[cfg(feature = "default-config")]
-pub fn get_default_no_cache_or_load(path: Option<&Path>) -> Result<Self, GetConfigError> {
+pub fn get_default_no_cache_or_load<T: AsRef<Path>>(path: Option<T>) -> Result<Self, GetConfigError> {
Ok(match path {
Some(path) => Self::load_from_file(path)?,
None => Self::get_default_no_cache()?
@@ -189,7 +189,7 @@ pub const DEFAULT_CONFIG_STR: &str = include_str!("../../default-config.json");
#[allow(dead_code, reason = "Public API.")]
pub static DEFAULT_CONFIG: OnceLock<Config> = OnceLock::new();

-/// An enum containing all possible errors that can happen when loading/parsing a rules into a [`Rules`]
+/// An enum containing all possible errors that can happen when loading/parsing a config.
#[derive(Debug, Error)]
pub enum GetConfigError {
/// Could not load the specified config file.
Expand Down
44 changes: 28 additions & 16 deletions src/types/jobs/job_config.rs
@@ -2,6 +2,7 @@
use std::error::Error;
use std::str::FromStr;
+use std::io;

use serde::{Serialize, Deserialize};
use url::Url;
@@ -11,6 +12,24 @@ use crate::types::*;
use crate::util::*;

/// Defines how each [`Job`] from a [`Jobs`] should be constructed.
+///
+/// When deserializing from a string or using [`FromStr::from_str`]/[`TryFrom<&str>`], a string starting with `{` has its contents deserialized as JSON; any other string is parsed as a URL.
+///
+/// For example, `{"url": "https://example.com"}` and `"{\"url\": \"https://example.com\"}"` deserialize to the same value.
+///
+/// This allows for more flexible APIs where having to input JSON objects is infeasible, like in command line interfaces.
+/// ```
+/// # use std::str::FromStr;
+/// # use url_cleaner::types::*;
+/// assert_eq!(
+///     serde_json::from_str::<JobConfig>("{\"url\": \"https://example.com\"}").unwrap(),
+///     serde_json::from_str::<JobConfig>("\"{\\\"url\\\": \\\"https://example.com\\\"}\"").unwrap()
+/// );
+/// assert_eq!(
+///     JobConfig::from_str("https://example.com").unwrap(),
+///     JobConfig::from_str("{\"url\": \"https://example.com\"}").unwrap()
+/// );
+/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(remote = "Self")]
pub struct JobConfig {
@@ -31,14 +50,22 @@ impl From<Url> for JobConfig {
}

/// The enum of errors [`JobConfig::from_str`] and [`<JobConfig as TryFrom<&str>>::try_from`] can return.
+///
+/// Additionally has [`Self::IoError`] and [`Self::Other`] to accommodate [`Jobs::job_configs_source`] iterators.
#[derive(Debug, Error)]
pub enum MakeJobConfigError {
/// Returned when a [`url::ParseError`] is encountered.
#[error(transparent)]
UrlParseError(#[from] url::ParseError),
/// Returned when a [`serde_json::Error`] is encountered.
#[error(transparent)]
-SerdeJsonError(#[from] serde_json::Error)
+SerdeJsonError(#[from] serde_json::Error),
+/// Returned when an [`io::Error`] is encountered.
+#[error(transparent)]
+IoError(#[from] io::Error),
+/// Generic error wrapper.
+#[error(transparent)]
+Other(#[from] Box<dyn Error>)
}

impl FromStr for JobConfig {
@@ -66,18 +93,3 @@
}

string_or_struct_magic!(JobConfig);

-/// The enum of errors that can happen when [`Jobs::iter`] tries to get a URL.
-#[derive(Debug, Error)]
-pub enum JobConfigSourceError {
-/// Returned when a [`MakeJobConfigError`] is encountered.
-#[error(transparent)]
-MakeJobConfigError(#[from] MakeJobConfigError),
-/// Returned when a [`std::io::Error`] is encountered.
-#[error(transparent)]
-IoError(#[from] std::io::Error),
-/// Catch-all for user-defined URL sources with errors not listed here.
-#[allow(dead_code, reason = "Public API for use in other people's code.")]
-#[error(transparent)]
-Other(#[from] Box<dyn Error>)
-}
2 changes: 0 additions & 2 deletions src/types/jobs/job_scratchpad.rs
@@ -4,8 +4,6 @@ use std::collections::HashMap;

use serde::{Serialize, Deserialize};

-#[allow(unused_imports, reason = "Used in a doc comment.")]
-use crate::types::*;
use crate::util::*;

/// Mutable state that you can use to track data between rules outside of the URL.
22 changes: 11 additions & 11 deletions src/types/jobs/jobs.rs
@@ -21,7 +21,7 @@ pub struct Jobs<'a> {
#[cfg(feature = "cache")]
pub cache: Cache,
/// The iterator [`JobConfig`]s are acquired from.
-pub job_config_source: Box<dyn Iterator<Item = Result<JobConfig, JobConfigSourceError>>>
+pub job_configs_source: Box<dyn Iterator<Item = Result<JobConfig, MakeJobConfigError>>>
}

impl ::core::fmt::Debug for Jobs<'_> {
@@ -31,15 +31,15 @@ impl ::core::fmt::Debug for Jobs<'_> {
x.field("config", &self.config);
#[cfg(feature = "cache")]
x.field("cache", &self.cache);
-x.field("job_config_source", &"...");
+x.field("job_configs_source", &"...");
x.finish()
}
}

-impl Jobs<'_> {
-/// Iterates over [`Job`]s created from [`JobConfig`]s returned from [`Self::job_config_source`].
-pub fn iter(&mut self) -> impl Iterator<Item = Result<Job<'_>, GetJobError>> {
-(&mut self.job_config_source)
+impl<'a> Jobs<'a> {
+/// Iterates over [`Job`]s created from [`JobConfig`]s returned from [`Self::job_configs_source`].
+pub fn iter(&'a mut self) -> impl Iterator<Item = Result<Job<'a>, MakeJobError>> {
+(&mut self.job_configs_source)
.map(|job_config_result| match job_config_result {
Ok(JobConfig {url, context}) => Ok(Job {
url,
@@ -56,7 +56,7 @@ impl Jobs<'_> {
///
/// Can be more convenient than [`Self::iter`].
#[allow(dead_code, reason = "Public API.")]
-pub fn with_job_config(&self, job_config: JobConfig) -> Job<'_> {
+pub fn with_job_config(&'a self, job_config: JobConfig) -> Job<'a> {
Job {
url: job_config.url,
config: &self.config,
Expand All @@ -67,10 +67,10 @@ impl Jobs<'_> {
}
}

-/// The enum of errors [`Jobs::iter`] can return.
+/// The enum of errors that can happen when [`Jobs::iter`] tries to get a URL.
#[derive(Debug, Error)]
-pub enum GetJobError {
-/// Returned when a [`JobConfigSourceError`] is encountered.
+pub enum MakeJobError {
+/// Returned when a [`MakeJobConfigError`] is encountered.
#[error(transparent)]
-JobConfigSourceError(#[from] JobConfigSourceError)
+MakeJobConfigError(#[from] MakeJobConfigError)
}
