Stuff, including moving none_to_empty_string into StringSource and UrlPart variants
Scripter17 · Mar 8, 2024 · b04ecf2 · b04ecf2
1 parent fb98be4
commit b04ecf2
Show file tree

Hide file tree

Showing 16 changed files with 479 additions and 576 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/benchmarking/benchmark.sh b/benchmarking/benchmark.sh
@@ -22,9 +22,9 @@ for url in "${URLS[@]}"; do
     out="$(echo $url | rg / -r=-)-$lines"
 
     hyperfine -N -n "$url - $lines" -w 10 --input ./stdin "$COMMAND" --export-json "hyperfine-$out"
-    rm -f callgrind.out*
     cat stdin | valgrind --tool=callgrind "../target/release/url-cleaner" > /dev/null
-    gprof2dot --format=callgrind callgrind.out* --output "callgrind-$out.dot"
+    mv callgrind.out.* "callgrind-$out.out"
+    gprof2dot --format=callgrind "callgrind-$out.out" --output "callgrind-$out.dot"
     dot -Tpng "callgrind-$out.dot" -o "callgrind-$out.png"
   done
 done
diff --git a/default-config.json b/default-config.json
@@ -130,6 +130,16 @@
 
 
 
+          {
+						"condition": {"All": [
+							{"UnqualifiedAnyTld": "google"},
+							{"PathIs": "/url"}
+						]},
+						"mapper": {"GetUrlFromQueryParam": "q"}
+					},
+
+
+
 					{
 						"condition": {"All": [
 							{"FlagIsSet": "bypass.vip"},
@@ -202,7 +212,6 @@
 				"value": {"Part": {"PartSegments": {"part": "Path", "split": "/", "start": 2, "end": null}}}
 			}}
 		},
-
 		{
 			"comment": "https://profile.example1.com.example2.com -> https://profile1.example1.com",
 			"condition": {"All": [
@@ -251,27 +260,52 @@
 
 
 
+		{
+			"HostMap": {
+				"theonion.com" : "RemoveQuery",
+				"teespring.com": "RemoveQuery",
+				"instagram.com": "RemoveQuery",
+				"vxtwitter.com": {"SetHost": "twitter.com"},
+				"fixvx.com"    : {"SetHost": "twitter.com"},
+				"fxtwitter.com": {"SetHost": "twitter.com"},
+				"x.com"        : {"SetHost": "twitter.com"},
+				"youtube.com"  : {"RemoveQueryParams": ["si", "feature"]},
+				"youtu.be"     : {"All": [
+					{"SetHost" : "youtube.com"},
+					{"CopyPart": {"from": {"PathSegment": 0}, "to": {"QueryParam": "v"}}},
+					{"SetPart" : {"part": "Path", "value": "watch"}},
+					{"RemoveQueryParams": ["si", "feature"]}
+				]},
+				"instagram.com": {"RemoveQueryParams": ["igshid", "igsh"]},
+				"threads.net"  : {"RemoveQueryParams": ["igshid", "igsh"]},
+				"stackoverflow.com" : {"RemoveQueryParams": ["so_medium", "so_source", "c"]},
+				"duckduckgo.com"    : {"RemoveQueryParams": ["t", "atb", "ia"]},
+				"washingtonpost.com": {"RemoveQueryParams": ["itid"]},
+				"tumblr.com"        : "RemoveQuery",
+				"at.tumblr.com"     : {"All": [
+					{"SetHost": "www.tumblr.com"},
+					{"SetPart": {"part": {"PathSegment": 1}, "value": null}},
+					"RemoveQuery"
+				]}
+			}
+		},
+
+
+
 		{
 			"condition": {"All": [
 				{"MaybeWWWDomain": "pixiv.net"},
 				{"PathIs": "/member.php"}
 			]},
 			"mapper": {"All": [
 				{"SetPart": {"part": "Path", "value": "/users"}},
-				{"CopyPart": {"from": {"QueryParam": "id"}, "from_none_to_empty_string": false, "to": "NextPathSegment"}},
+				{"CopyPart": {"from": {"QueryParam": "id"}, "to": "NextPathSegment"}},
 				"RemoveQuery"
 			]}
 		},
 
 
 
-		{
-			"condition": {"QualifiedDomain": "at.tumblr.com"},
-			"mapper": {"All": [
-				{"SetHost": "www.tumblr.com"},
-				{"SetPart": {"part": {"PathSegment": 1}, "value": null}}
-			]}
-		},
 		{
 			"condition": {"All": [
 				{"UnqualifiedDomain": "tumblr.com"},
@@ -309,32 +343,6 @@
 
 
 
-		{
-			"HostMap": {
-				"theonion.com" : "RemoveQuery",
-				"teespring.com": "RemoveQuery",
-				"instagram.com": "RemoveQuery",
-				"vxtwitter.com": {"SetHost": "twitter.com"},
-				"fixvx.com"    : {"SetHost": "twitter.com"},
-				"fxtwitter.com": {"SetHost": "twitter.com"},
-				"x.com"        : {"SetHost": "twitter.com"},
-				"youtube.com"  : {"RemoveQueryParams": ["si", "feature"]},
-				"instagram.com": {"RemoveQueryParams": ["igshid", "igsh"]},
-				"threads.net"  : {"RemoveQueryParams": ["igshid", "igsh"]},
-				"youtu.be"     : {"All": [
-					{"SetHost" : "youtube.com"},
-					{"CopyPart": {"from": {"PathSegment": 0}, "to": {"QueryParam": "v"}}},
-					{"SetPart" : {"part": "Path", "value": "watch"}}
-				]},
-				"stackoverflow.com" : {"RemoveQueryParams": ["so_medium", "so_source", "c"]},
-				"duckduckgo.com"    : {"RemoveQueryParams": ["t", "atb", "ia"]},
-				"washingtonpost.com": {"RemoveQueryParams": ["itid"]},
-				"tumblr.com"        : "RemoveQuery"
-			}
-		},
-
-
-
 		{
 			"comment": "Discord embeds images using dedicated \"external images\" server(s). This gets the original image.",
 			"condition": {"All": [

diff --git a/src/glue/command.rs b/src/glue/command.rs
@@ -1,14 +1,14 @@
 use std::process::{Command, Output as CommandOutput, Stdio};
-use std::io::{Write, Error as IoError};
+use std::io::Write;
 use std::path::PathBuf;
-use std::str::{from_utf8, FromStr, Utf8Error};
+use std::str::{from_utf8, FromStr};
 use std::collections::HashMap;
 use std::convert::Infallible;
 use std::ffi::{OsStr, OsString};
 #[cfg(target_family = "unix")]
 use std::os::unix::ffi::OsStrExt;
 
-use url::{Url, ParseError};
+use url::Url;
 use thiserror::Error;
 use serde::{Serialize, Deserialize};
 use which::which;
@@ -57,15 +57,15 @@ string_or_struct_magic!(CommandConfig);
 /// The enum of all possible errors [`CommandConfig::exit_code`], [`CommandConfig::output`], and [`CommandConfig::get_url`] can return.
 #[derive(Debug, Error)]
 pub enum CommandError {
-    /// I/O error.
+    /// Returned when a [`std::io::Error`] is encountered.
     #[error(transparent)]
-    IoError(#[from] IoError),
-    /// UTF-8 error.
+    IoError(#[from] std::io::Error),
+    /// Returned when a [`std::str::Utf8Error`] is encountered.
     #[error(transparent)]
-    Utf8Error(#[from] Utf8Error),
-    /// URL parsing error.
+    Utf8Error(#[from] std::str::Utf8Error),
+    /// Returned when a [`url::ParseError`] is encountered.
     #[error(transparent)]
-    ParseError(#[from] ParseError),
+    UrlParseError(#[from] url::ParseError),
     /// The command was terminated by a signal. See [`std::process::ExitStatus::code`] for details.
     #[error("The command was terminated by a signal. See std::process::ExitStatus::code for details.")]
     SignalTermination,
@@ -147,7 +147,7 @@ impl CommandConfig {
     /// Runs the command, does the [`OutputHandler`] stuff, removes trailing newlines and carriage returns from the output, then extracts the URL.
     /// # Errors
     /// If the call to [`Self::output`] returns an error, that error is returned.
-    /// If the output cannot be parsed as a URL (give or take trailing newlines and carriage returns), returns the error [`CommandError::ParseError`].
+    /// If the output cannot be parsed as a URL (give or take trailing newlines and carriage returns), returns the error [`CommandError::UrlParseError`].
     pub fn get_url(&self, url: Option<&Url>) -> Result<Url, CommandError> {
         Ok(Url::parse(self.output(url, None)?.trim_end_matches(&['\r', '\n']))?)
     }

diff --git a/src/glue/regex.rs b/src/glue/regex.rs
@@ -14,6 +14,7 @@ use regex::{Regex, Replacer, Match, Captures};
 /// This is because converting a [`Regex`] into a [`RegexParts`] is extremely complicated and because it allows lazy compilation of regexes.
 /// Because the contained regex and regex parts have to always be in sync, the fields of this struct are unfortunately private.
 /// In place of public fields, various [`Into`]'s and getters are defined for this type.
+/// This does not implement [`std::ops::Deref`] or [`std::convert::AsRef`]`<`[`Regex`]`>` because [`Self::get_regex`] can panic, which is disallowed in [`std::ops::Deref::deref`] and [`std::convert::AsRef::as_ref`].
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(from = "RegexParts", into = "RegexParts")]
 pub struct RegexWrapper {
@@ -53,6 +54,12 @@ impl From<RegexWrapper> for RegexParts {
     }
 }
 
+impl AsRef<RegexParts> for RegexWrapper {
+    fn as_ref(&self) -> &RegexParts {
+        &self.parts
+    }
+}
+
 impl RegexWrapper {
     /// Gets the cached compiled regex and compiles it first if it's not already cached.
     /// # Panics

diff --git a/src/glue/regex/regex_parts.rs b/src/glue/regex/regex_parts.rs
@@ -26,26 +26,38 @@ impl AsRef<RegexConfig> for RegexParts {
     }
 }
 
+impl AsRef<String> for RegexParts {
+    fn as_ref(&self) -> &String {
+        &self.pattern
+    }
+}
+
+impl AsRef<str> for RegexParts {
+    fn as_ref(&self) -> &str {
+        &self.pattern
+    }
+}
+
 /// The configuration determining how a regular expression works.
 #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct RegexConfig {
-    /// The flag that decides if [`RegexBuilder::case_insensitive`] is set. Defaults to `false`. This flags character is `'i'`.
+    /// The value passed into [`RegexBuilder::case_insensitive`]. Defaults to `false`. This flag's character is `'i'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub case_insensitive: bool,
-    /// The flag that decides if [`RegexBuilder::crlf`] is set. Defaults to `false`. This flags character is `'R'`.
+    /// The value passed into [`RegexBuilder::crlf`]. Defaults to `false`. This flag's character is `'R'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub crlf: bool,
-    /// The flag that decides if [`RegexBuilder::dot_matches_new_line`] is set. Defaults to `false`. This flags character is `'s'`.
+    /// The value passed into [`RegexBuilder::dot_matches_new_line`]. Defaults to `false`. This flag's character is `'s'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub dot_matches_new_line: bool,
-    /// The flag that decides if [`RegexBuilder::ignore_whitespace`] is set. Defaults to `false`. This flags character is `'x'`.
+    /// The value passed into [`RegexBuilder::ignore_whitespace`]. Defaults to `false`. This flag's character is `'x'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub ignore_whitespace: bool,
-    /// The flag that decides if [`RegexBuilder::line_terminator`] is set. Defaults to `b'\n'` (`10`).
+    /// The value passed into [`RegexBuilder::line_terminator`]. Defaults to `b'\n'` (`10`).
     #[serde(default = "newline_u8", skip_serializing_if = "is_nlu8" )] pub line_terminator: u8,
-    /// The flag that decides if [`RegexBuilder::multi_line`] is set. Defaults to `false`. This flags character is `'m'`.
+    /// The value passed into [`RegexBuilder::multi_line`]. Defaults to `false`. This flag's character is `'m'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub multi_line: bool,
-    /// The flag that decides if [`RegexBuilder::octal`] is set. Defaults to `false`. This flags character is `'o'` because the `regex` crate forgot and I said so.
+    /// The value passed into [`RegexBuilder::octal`]. Defaults to `false`. This flag's character is `'o'` because the `regex` crate forgot and I said so.
     #[serde(default               , skip_serializing_if = "is_false")] pub octal: bool,
-    /// The flag that decides if [`RegexBuilder::swap_greed`] is set. Defaults to `false`. This flags character is `'U'`.
+    /// The value passed into [`RegexBuilder::swap_greed`]. Defaults to `false`. This flag's character is `'U'`.
     #[serde(default               , skip_serializing_if = "is_false")] pub swap_greed: bool,
-    /// The flag that decides if [`RegexBuilder::unicode`] is set. Defaults to `true`. This flags character is `'u'`.
+    /// The value passed into [`RegexBuilder::unicode`]. Defaults to `true`. This flag's character is `'u'`.
     #[serde(default = "get_true"  , skip_serializing_if = "is_true" )] pub unicode: bool
 }
 

diff --git a/src/rules.rs b/src/rules.rs
@@ -55,8 +55,7 @@ pub enum Rule {
     ///             condition: Condition::Always,
     ///             mapper: Mapper::SetPart {
     ///                 part: UrlPart::NextPathSegment,
-    ///                 value: Some(FromStr::from_str("a").unwrap()),
-    ///                 value_none_to_empty_string: false
+    ///                 value: Some(FromStr::from_str("a").unwrap())
     ///             }
     ///         }
     ///     ],