-
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from robinst/check-domains
More strict parsing of hostname (authority) part of URLs
- Loading branch information
Showing
10 changed files
with
662 additions
and
236 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). | |
This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), | ||
with the exception that 0.x versions can break between minor versions. | ||
|
||
## [Unreleased] | ||
### Changed | ||
- More strict parsing of hostname (authority) part of URLs. Applies to | ||
emails, plain domain URLs (e.g. `example.com/foo`) and URLs with | ||
schemes where a host is expected (e.g. `https`). | ||
|
||
This fixes a few problems that have been reported over time, namely: | ||
|
||
- `https://www.example..com` is no longer parsed as a URL (#41) | ||
- `foo@v1.1.1` is no longer parsed as an email address (#29) | ||
- `https://*.example.org` is no longer parsed as a URL (#38) | ||
|
||
It's a tricky change and hopefully this solves some problems while | ||
not introducing too many new ones. If anything unexpectedly changed | ||
for you, please let us know! | ||
|
||
## [0.8.1] - 2022-04-14 | ||
### Changed | ||
- Skip parsing very short strings for URLs as a performance optimization | ||
|
@@ -76,6 +92,7 @@ Initial release of linkify, a Rust library to find links such as URLs and email | |
addresses in plain text, handling surrounding punctuation correctly. | ||
|
||
|
||
[Unreleased]: https://github.com/robinst/linkify/compare/0.8.1...HEAD | ||
[0.8.1]: https://github.com/robinst/linkify/compare/0.8.0...0.8.1 | ||
[0.8.0]: https://github.com/robinst/linkify/compare/0.7.0...0.8.0 | ||
[0.7.0]: https://github.com/robinst/linkify/compare/0.6.0...0.7.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
//! Domain name related scanning, used by both email and URL scanners. | ||
//! | ||
//! This is called domains for familiarity but it's about the authority part of URLs as defined in | ||
//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2 | ||
//! | ||
//! ```text | ||
//! authority = [ userinfo "@" ] host [ ":" port ] | ||
//! | ||
//! | ||
//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) | ||
//! | ||
//! host = IP-literal / IPv4address / reg-name | ||
//! | ||
//! IP-literal = "[" ( IPv6address / IPvFuture ) "]" | ||
//! | ||
//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet | ||
//! | ||
//! reg-name = *( unreserved / pct-encoded / sub-delims ) | ||
//! | ||
//! | ||
//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" | ||
//! | ||
//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | ||
//! | ||
//! pct-encoded = "%" HEXDIG HEXDIG | ||
//! ``` | ||
use std::char; | ||
|
||
/// Scans the beginning of `s` as the authority part of a URL (or the domain
/// part of an email address) and reports where it ends.
///
/// # Parameters
///
/// * `s` - text starting at the first character of the candidate authority.
/// * `userinfo_allowed` - whether a `userinfo@` prefix may still appear. Once
///   an `@` has been consumed, a second `@` rejects the whole authority.
/// * `require_host` - when `true`, the text must look like a host name or a
///   dotted numeric address (non-empty labels, no leading/trailing hyphen in a
///   label, plausible TLD). When `false`, the checks are relaxed and a `/`
///   closes the authority.
/// * `port_allowed` - whether a `:` may be followed by a port. A `:` ends the
///   scan only if neither userinfo nor a port is allowed.
///
/// # Returns
///
/// `(end, last_dot)` where `end` is the byte offset just past the last
/// character that may end the authority (`Some(0)` if nothing was accepted),
/// and `last_dot` is the byte index of the last `.` in the host that was
/// followed by a letter or digit (dots after a `:` are not counted).
/// `(None, None)` means the text was rejected.
pub(crate) fn find_authority_end(
    s: &str,
    mut userinfo_allowed: bool,
    require_host: bool,
    port_allowed: bool,
) -> (Option<usize>, Option<usize>) {
    let mut end = Some(0);

    // A dot is only promoted from `maybe_last_dot` to `last_dot` once a letter
    // or digit follows it, so a trailing dot is never reported.
    let mut maybe_last_dot = None;
    let mut last_dot = None;
    let mut dot_allowed = false;
    let mut hyphen_allowed = false;
    let mut all_numeric = true;
    let mut maybe_host = true;
    let mut host_ended = false;

    for (i, c) in s.char_indices() {
        let can_be_last = match c {
            // ALPHA (including any non-ASCII character) and DIGIT
            'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => {
                // Can start or end a domain label.
                dot_allowed = true;
                hyphen_allowed = true;
                last_dot = maybe_last_dot;
                if !c.is_ascii_digit() {
                    all_numeric = false;
                }

                // More label characters after the host was already terminated
                // means this cannot be a valid host.
                if host_ended {
                    maybe_host = false;
                }

                !require_host || !host_ended
            }
            // unreserved
            '-' => {
                // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com`
                if !hyphen_allowed {
                    maybe_host = false;
                }
                // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com`
                dot_allowed = false;
                all_numeric = false;

                !require_host
            }
            '.' => {
                if !dot_allowed {
                    // Label can't be empty, e.g. `.example.com` or `a..com`
                    host_ended = true;
                }
                dot_allowed = false;
                hyphen_allowed = false;
                maybe_last_dot = Some(i);

                false
            }
            '_' | '~' => {
                // Hostnames can't contain these and we don't want to treat them as delimiters.
                maybe_host = false;

                false
            }
            // sub-delims
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => {
                // Can't be in hostnames, but we treat them as delimiters
                host_ended = true;

                if !userinfo_allowed && require_host {
                    // We don't have to look further
                    break;
                }

                false
            }
            ':' => {
                // Could be in userinfo, or we're getting a port now.
                if !userinfo_allowed && !port_allowed {
                    break;
                }

                // Don't advance the last dot when we get to port numbers
                maybe_last_dot = last_dot;

                false
            }
            '@' => {
                if !userinfo_allowed {
                    // We already had userinfo, can't have another `@` in a valid authority.
                    return (None, None);
                }

                // Everything before this turned out to be userinfo, so reset
                // all state that was collected about the host.
                userinfo_allowed = false;

                maybe_last_dot = None;
                last_dot = None;
                dot_allowed = false;
                hyphen_allowed = false;
                all_numeric = true;
                maybe_host = true;
                host_ended = false;

                false
            }
            '/' => {
                if !require_host {
                    // For schemes where we allow anything, we want to stop at delimiter characters
                    // except if we get a slash closing the URL, which happened here.
                    end = Some(i);
                }
                break;
            }
            _ => {
                // Anything else, this might be the end of the authority (can be empty).
                // Now let the rest of the code handle checking whether the end of the URL is
                // valid.
                break;
            }
        };

        if can_be_last {
            end = Some(i + c.len_utf8());
        }
    }

    if !require_host {
        return (end, last_dot);
    }
    if !maybe_host {
        return (None, None);
    }

    // Can't have just a number without dots as the authority
    if all_numeric && last_dot.is_none() && end != Some(0) {
        return (None, None);
    }

    // If we have something that is not just numeric (not an IP address),
    // check that the TLD looks reasonable. This is to avoid linking things like
    // `abc@v1.1`.
    if !all_numeric {
        if let Some(dot) = last_dot {
            // `.` is a single byte, so `dot + 1` is always a char boundary.
            if !valid_tld(&s[dot + 1..]) {
                return (None, None);
            }
        }
    }

    (end, last_dot)
}

/// Returns `true` if `tld` starts with at least two ASCII letters.
///
/// Only the leading run of ASCII letters is inspected, so trailing text such
/// as a path (`org/path`) does not affect the result.
fn valid_tld(tld: &str) -> bool {
    tld.chars()
        .take_while(|c| c.is_ascii_alphabetic())
        .take(2)
        .count()
        >= 2
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
use std::ops::Range; | ||
|
||
use crate::domains::find_authority_end; | ||
use crate::scanner::Scanner; | ||
|
||
/// Scan for email address starting from the trigger character "@". | ||
|
@@ -40,6 +41,9 @@ impl EmailScanner { | |
break; | ||
} | ||
atom_boundary = true; | ||
} else if c == '@' { | ||
// In `@me@a.com`, we don't want to extract `me@a.com`. | ||
return None; | ||
} else { | ||
break; | ||
} | ||
|
@@ -49,40 +53,8 @@ impl EmailScanner { | |
|
||
// See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531 | ||
fn find_end(&self, s: &str) -> Option<usize> { | ||
let mut first_in_sub_domain = true; | ||
let mut can_end_sub_domain = false; | ||
let mut first_dot = None; | ||
let mut end = None; | ||
|
||
for (i, c) in s.char_indices() { | ||
if first_in_sub_domain { | ||
if Self::sub_domain_allowed(c) { | ||
end = Some(i + c.len_utf8()); | ||
first_in_sub_domain = false; | ||
can_end_sub_domain = true; | ||
} else { | ||
break; | ||
} | ||
} else if c == '.' { | ||
if !can_end_sub_domain { | ||
break; | ||
} | ||
first_in_sub_domain = true; | ||
if first_dot.is_none() { | ||
first_dot = Some(i); | ||
} | ||
} else if c == '-' { | ||
can_end_sub_domain = false; | ||
} else if Self::sub_domain_allowed(c) { | ||
end = Some(i + c.len_utf8()); | ||
can_end_sub_domain = true; | ||
} else { | ||
break; | ||
} | ||
} | ||
|
||
if let Some(end) = end { | ||
if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) { | ||
if let (Some(end), last_dot) = find_authority_end(s, false, true, false) { | ||
if !self.domain_must_have_dot || last_dot.is_some() { | ||
Some(end) | ||
} else { | ||
None | ||
|
@@ -120,27 +92,4 @@ impl EmailScanner { | |
_ => c >= '\u{80}', | ||
} | ||
} | ||
|
||
// See "sub-domain" in RFC 5321. Extension in RFC 6531 is simplified,
// this can also match invalid domains.
//
// Accepts ASCII letters and digits, plus every character at or above U+0080.
fn sub_domain_allowed(c: char) -> bool {
    c.is_ascii_alphanumeric() || c >= '\u{80}'
}
} | ||
|
||
/// Helper function to check if given string is considered an email address. | ||
#[inline] | ||
pub(crate) fn is_mail(input: &str) -> bool { | ||
input | ||
.char_indices() | ||
.filter(|(_, c)| *c == '@') | ||
.any(|(i, _)| { | ||
let scanner = EmailScanner { | ||
domain_must_have_dot: true, | ||
}; | ||
scanner.scan(input, i).is_some() | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.