Skip to content

Commit

Permalink
Stuff, including moving none_to_empty_string into StringSource and Ur…
Browse files Browse the repository at this point in the history
…lPart variants
  • Loading branch information
Scripter17 committed Mar 8, 2024
1 parent fb98be4 commit b04ecf2
Show file tree
Hide file tree
Showing 16 changed files with 479 additions and 576 deletions.
20 changes: 10 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions benchmarking/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ for url in "${URLS[@]}"; do
out="$(echo $url | rg / -r=-)-$lines"

hyperfine -N -n "$url - $lines" -w 10 --input ./stdin "$COMMAND" --export-json "hyperfine-$out"
rm -f callgrind.out*
cat stdin | valgrind --tool=callgrind "../target/release/url-cleaner" > /dev/null
gprof2dot --format=callgrind callgrind.out* --output "callgrind-$out.dot"
mv callgrind.out.* "callgrind-$out.out"
gprof2dot --format=callgrind "callgrind-$out.out" --output "callgrind-$out.dot"
dot -Tpng "callgrind-$out.dot" -o "callgrind-$out.png"
done
done
78 changes: 43 additions & 35 deletions default-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@



{
"condition": {"All": [
{"UnqualifiedAnyTld": "google"},
{"PathIs": "/url"}
]},
"mapper": {"GetUrlFromQueryParam": "q"}
},



{
"condition": {"All": [
{"FlagIsSet": "bypass.vip"},
Expand Down Expand Up @@ -202,7 +212,6 @@
"value": {"Part": {"PartSegments": {"part": "Path", "split": "/", "start": 2, "end": null}}}
}}
},

{
"comment": "https://profile.example1.com.example2.com -> https://profile1.example1.com",
"condition": {"All": [
Expand Down Expand Up @@ -251,27 +260,52 @@



{
  "HostMap": {
    "theonion.com" : "RemoveQuery",
    "teespring.com": "RemoveQuery",
    "vxtwitter.com": {"SetHost": "twitter.com"},
    "fixvx.com" : {"SetHost": "twitter.com"},
    "fxtwitter.com": {"SetHost": "twitter.com"},
    "x.com" : {"SetHost": "twitter.com"},
    "youtube.com" : {"RemoveQueryParams": ["si", "feature"]},
    "youtu.be" : {"All": [
      {"SetHost" : "youtube.com"},
      {"CopyPart": {"from": {"PathSegment": 0}, "to": {"QueryParam": "v"}}},
      {"SetPart" : {"part": "Path", "value": "watch"}},
      {"RemoveQueryParams": ["si", "feature"]}
    ]},
    "instagram.com": {"RemoveQueryParams": ["igshid", "igsh"]},
    "threads.net" : {"RemoveQueryParams": ["igshid", "igsh"]},
    "stackoverflow.com" : {"RemoveQueryParams": ["so_medium", "so_source", "c"]},
    "duckduckgo.com" : {"RemoveQueryParams": ["t", "atb", "ia"]},
    "washingtonpost.com": {"RemoveQueryParams": ["itid"]},
    "tumblr.com" : "RemoveQuery",
    "at.tumblr.com" : {"All": [
      {"SetHost": "www.tumblr.com"},
      {"SetPart": {"part": {"PathSegment": 1}, "value": null}},
      "RemoveQuery"
    ]}
  }
},



{
"condition": {"All": [
{"MaybeWWWDomain": "pixiv.net"},
{"PathIs": "/member.php"}
]},
"mapper": {"All": [
{"SetPart": {"part": "Path", "value": "/users"}},
{"CopyPart": {"from": {"QueryParam": "id"}, "from_none_to_empty_string": false, "to": "NextPathSegment"}},
{"CopyPart": {"from": {"QueryParam": "id"}, "to": "NextPathSegment"}},
"RemoveQuery"
]}
},



{
"condition": {"QualifiedDomain": "at.tumblr.com"},
"mapper": {"All": [
{"SetHost": "www.tumblr.com"},
{"SetPart": {"part": {"PathSegment": 1}, "value": null}}
]}
},
{
"condition": {"All": [
{"UnqualifiedDomain": "tumblr.com"},
Expand Down Expand Up @@ -309,32 +343,6 @@



{
"HostMap": {
"theonion.com" : "RemoveQuery",
"teespring.com": "RemoveQuery",
"instagram.com": "RemoveQuery",
"vxtwitter.com": {"SetHost": "twitter.com"},
"fixvx.com" : {"SetHost": "twitter.com"},
"fxtwitter.com": {"SetHost": "twitter.com"},
"x.com" : {"SetHost": "twitter.com"},
"youtube.com" : {"RemoveQueryParams": ["si", "feature"]},
"instagram.com": {"RemoveQueryParams": ["igshid", "igsh"]},
"threads.net" : {"RemoveQueryParams": ["igshid", "igsh"]},
"youtu.be" : {"All": [
{"SetHost" : "youtube.com"},
{"CopyPart": {"from": {"PathSegment": 0}, "to": {"QueryParam": "v"}}},
{"SetPart" : {"part": "Path", "value": "watch"}}
]},
"stackoverflow.com" : {"RemoveQueryParams": ["so_medium", "so_source", "c"]},
"duckduckgo.com" : {"RemoveQueryParams": ["t", "atb", "ia"]},
"washingtonpost.com": {"RemoveQueryParams": ["itid"]},
"tumblr.com" : "RemoveQuery"
}
},



{
"comment": "Discord embeds images using dedicated \"external images\" server(s). This gets the original image.",
"condition": {"All": [
Expand Down
20 changes: 10 additions & 10 deletions src/glue/command.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use std::process::{Command, Output as CommandOutput, Stdio};
use std::io::{Write, Error as IoError};
use std::io::Write;
use std::path::PathBuf;
use std::str::{from_utf8, FromStr, Utf8Error};
use std::str::{from_utf8, FromStr};
use std::collections::HashMap;
use std::convert::Infallible;
use std::ffi::{OsStr, OsString};
#[cfg(target_family = "unix")]
use std::os::unix::ffi::OsStrExt;

use url::{Url, ParseError};
use url::Url;
use thiserror::Error;
use serde::{Serialize, Deserialize};
use which::which;
Expand Down Expand Up @@ -57,15 +57,15 @@ string_or_struct_magic!(CommandConfig);
/// The enum of all possible errors [`CommandConfig::exit_code`], [`CommandConfig::output`], and [`CommandConfig::get_url`] can return.
#[derive(Debug, Error)]
pub enum CommandError {
/// I/O error.
/// Returned when a [`std::io::Error`] is encountered.
#[error(transparent)]
IoError(#[from] IoError),
/// UTF-8 error.
IoError(#[from] std::io::Error),
/// Returned when a [`std::str::Utf8Error`] is encountered.
#[error(transparent)]
Utf8Error(#[from] Utf8Error),
/// URL parsing error.
Utf8Error(#[from] std::str::Utf8Error),
/// Returned when a [`url::ParseError`] is encountered.
#[error(transparent)]
ParseError(#[from] ParseError),
UrlParseError(#[from] url::ParseError),
/// The command was terminated by a signal. See [`std::process::ExitStatus::code`] for details.
#[error("The command was terminated by a signal. See std::process::ExitStatus::code for details.")]
SignalTermination,
Expand Down Expand Up @@ -147,7 +147,7 @@ impl CommandConfig {
/// Runs the command through [`Self::output`], strips any trailing newlines and carriage returns from the output, then parses what is left as a URL.
/// # Errors
/// If the call to [`Self::output`] returns an error, that error is returned.
/// If the trimmed output cannot be parsed as a URL, returns the error [`CommandError::UrlParseError`].
pub fn get_url(&self, url: Option<&Url>) -> Result<Url, CommandError> {
    let raw_output = self.output(url, None)?;
    // Commands usually end their output with a line break, which is not part of the URL.
    let trimmed = raw_output.trim_end_matches(&['\r', '\n']);
    let parsed = Url::parse(trimmed)?;
    Ok(parsed)
}
Expand Down
7 changes: 7 additions & 0 deletions src/glue/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use regex::{Regex, Replacer, Match, Captures};
/// This is because converting a [`Regex`] into a [`RegexParts`] is extremely complicated and because it allows lazy compilation of regexes.
/// Because the contained regex and regex parts have to always be in sync, the fields of this struct are unfortunately private.
/// In place of public fields, various [`Into`]'s and getters are defined for this type.
/// This does not implement [`std::ops::Deref`] or [`std::convert::AsRef`]`<`[`Regex`]`>` because [`Self::get_regex`] can panic, which is disallowed in [`std::ops::Deref::deref`] and [`std::convert::AsRef::as_ref`].
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(from = "RegexParts", into = "RegexParts")]
pub struct RegexWrapper {
Expand Down Expand Up @@ -53,6 +54,12 @@ impl From<RegexWrapper> for RegexParts {
}
}

impl AsRef<RegexParts> for RegexWrapper {
    /// Borrows the [`RegexParts`] stored in this wrapper.
    fn as_ref(&self) -> &RegexParts {
        let Self { parts, .. } = self;
        parts
    }
}

impl RegexWrapper {
/// Gets the cached compiled regex and compiles it first if it's not already cached.
/// # Panics
Expand Down
30 changes: 21 additions & 9 deletions src/glue/regex/regex_parts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,38 @@ impl AsRef<RegexConfig> for RegexParts {
}
}

impl AsRef<String> for RegexParts {
    /// Borrows the pattern field as a [`String`].
    fn as_ref(&self) -> &String {
        let Self { pattern, .. } = self;
        pattern
    }
}

impl AsRef<str> for RegexParts {
    /// Borrows the pattern field as a string slice.
    fn as_ref(&self) -> &str {
        self.pattern.as_str()
    }
}

/// The configuration determining how a regular expression works.
/// Each field holds the value passed into the [`RegexBuilder`] method of the same name.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct RegexConfig {
    /// The value passed into [`RegexBuilder::case_insensitive`]. Defaults to `false`. This flag's character is `'i'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub case_insensitive: bool,
    /// The value passed into [`RegexBuilder::crlf`]. Defaults to `false`. This flag's character is `'R'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub crlf: bool,
    /// The value passed into [`RegexBuilder::dot_matches_new_line`]. Defaults to `false`. This flag's character is `'s'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub dot_matches_new_line: bool,
    /// The value passed into [`RegexBuilder::ignore_whitespace`]. Defaults to `false`. This flag's character is `'x'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub ignore_whitespace: bool,
    /// The value passed into [`RegexBuilder::line_terminator`]. Defaults to `b'\n'` (`10`).
    #[serde(default = "newline_u8", skip_serializing_if = "is_nlu8")]
    pub line_terminator: u8,
    /// The value passed into [`RegexBuilder::multi_line`]. Defaults to `false`. This flag's character is `'m'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub multi_line: bool,
    /// The value passed into [`RegexBuilder::octal`]. Defaults to `false`. This flag's character is `'o'`, chosen by this crate because the `regex` crate does not define one.
    #[serde(default, skip_serializing_if = "is_false")]
    pub octal: bool,
    /// The value passed into [`RegexBuilder::swap_greed`]. Defaults to `false`. This flag's character is `'U'`.
    #[serde(default, skip_serializing_if = "is_false")]
    pub swap_greed: bool,
    /// The value passed into [`RegexBuilder::unicode`]. Defaults to `true`. This flag's character is `'u'`.
    #[serde(default = "get_true", skip_serializing_if = "is_true")]
    pub unicode: bool
}

Expand Down
3 changes: 1 addition & 2 deletions src/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ pub enum Rule {
/// condition: Condition::Always,
/// mapper: Mapper::SetPart {
/// part: UrlPart::NextPathSegment,
/// value: Some(FromStr::from_str("a").unwrap()),
/// value_none_to_empty_string: false
/// value: Some(FromStr::from_str("a").unwrap())
/// }
/// }
/// ],
Expand Down
Loading

0 comments on commit b04ecf2

Please sign in to comment.