Skip to content

Commit

Permalink
Merge pull request #43 from robinst/check-domains
Browse files Browse the repository at this point in the history
More strict parsing of hostname (authority) part of URLs
  • Loading branch information
robinst authored Jul 11, 2022
2 parents 9a6ce39 + 97152fa commit b6ad06e
Show file tree
Hide file tree
Showing 10 changed files with 662 additions and 236 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html),
with the exception that 0.x versions can break between minor versions.

## [Unreleased]
### Changed
- Stricter parsing of the hostname (authority) part of URLs. Applies to
emails, plain domain URLs (e.g. `example.com/foo`) and URLs with
schemes where a host is expected (e.g. `https`).

This fixes a few problems that have been reported over time, namely:

- `https://www.example..com` is no longer parsed as a URL (#41)
- `foo@v1.1.1` is no longer parsed as an email address (#29)
- `https://*.example.org` is no longer parsed as a URL (#38)

It's a tricky change and hopefully this solves some problems while
not introducing too many new ones. If anything unexpectedly changed
for you, please let us know!

## [0.8.1] - 2022-04-14
### Changed
- Skip parsing very short strings for URLs as a performance optimization
Expand Down Expand Up @@ -76,6 +92,7 @@ Initial release of linkify, a Rust library to find links such as URLs and email
addresses in plain text, handling surrounding punctuation correctly.


[Unreleased]: https://github.com/robinst/linkify/compare/0.8.1...HEAD
[0.8.1]: https://github.com/robinst/linkify/compare/0.8.0...0.8.1
[0.8.0]: https://github.com/robinst/linkify/compare/0.7.0...0.8.0
[0.7.0]: https://github.com/robinst/linkify/compare/0.6.0...0.7.0
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ memchr = "2.0.1"

[dev-dependencies]
criterion = "0.3"
plotters-backend = "= 0.3.2" # 0.3.4 requires later Rust
doc-comment = "0.3.3"


Expand Down
201 changes: 201 additions & 0 deletions src/domains.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
//! Domain name related scanning, used by both email and URL scanners.
//!
//! This is called domains for familiarity but it's about the authority part of URLs as defined in
//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
//!
//! ```text
//! authority = [ userinfo "@" ] host [ ":" port ]
//!
//!
//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
//!
//! host = IP-literal / IPv4address / reg-name
//!
//! IP-literal = "[" ( IPv6address / IPvFuture ) "]"
//!
//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
//!
//! reg-name = *( unreserved / pct-encoded / sub-delims )
//!
//!
//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
//!
//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
//!
//! pct-encoded = "%" HEXDIG HEXDIG
//! ```
use std::char;

/// Scans `s` from its first byte and locates the end of a URL authority
/// (`[ userinfo "@" ] host [ ":" port ]`).
///
/// * `userinfo_allowed` - whether a userinfo part terminated by `@` may precede the host.
///   Once an `@` has been consumed, a second `@` rejects the whole input.
/// * `require_host` - when `true`, the text must look like a hostname (non-empty labels,
///   no leading/trailing hyphen in a label, plausible TLD, not a bare number). When
///   `false`, no hostname validation is applied to the result.
/// * `port_allowed` - whether a `:` may be followed by a port. If neither userinfo nor a
///   port is allowed, scanning stops at the first `:`.
///
/// Returns `(end, last_dot)`: `end` is the byte offset just past the last character that
/// may end the authority and `last_dot` is the byte offset of the last `.` that was
/// followed by a letter or digit. Returns `(None, None)` if the input is rejected.
pub(crate) fn find_authority_end(
    s: &str,
    mut userinfo_allowed: bool,
    require_host: bool,
    port_allowed: bool,
) -> (Option<usize>, Option<usize>) {
    // Byte offset just past the last character that is allowed to end the authority.
    let mut authority_end = 0;

    // A dot only counts once a letter or digit follows it; until then it is "pending".
    let mut pending_dot = None;
    let mut confirmed_dot = None;
    // Whether the current label may be closed by a dot / continued by a hyphen.
    let mut label_can_close = false;
    let mut label_is_open = false;
    let mut only_digits = true;
    let mut plausible_host = true;
    let mut host_closed = false;

    for (idx, ch) in s.char_indices() {
        match ch {
            // ALPHA (including everything non-ASCII) and DIGIT
            'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => {
                // These can start or end a label.
                label_can_close = true;
                label_is_open = true;
                confirmed_dot = pending_dot;
                if !ch.is_ascii_digit() {
                    only_digits = false;
                }

                // More label characters after the host was closed: not a hostname.
                if host_closed {
                    plausible_host = false;
                }

                if !(require_host && host_closed) {
                    authority_end = idx + ch.len_utf8();
                }
            }
            '-' => {
                // A label must not begin with a hyphen, e.g. `-b` in `a.-b.com`.
                if !label_is_open {
                    plausible_host = false;
                }
                // A label must not end with a hyphen either, e.g. `b-` in `a.b-.com`.
                label_can_close = false;
                only_digits = false;

                // A hyphen may only be the final character when no host is required.
                if !require_host {
                    authority_end = idx + ch.len_utf8();
                }
            }
            '.' => {
                // An empty label (`.example.com`, `a..com`) closes the host.
                if !label_can_close {
                    host_closed = true;
                }
                label_can_close = false;
                label_is_open = false;
                pending_dot = Some(idx);
            }
            '_' | '~' => {
                // Not valid in hostnames, and deliberately not treated as delimiters.
                plausible_host = false;
            }
            // sub-delims
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => {
                // Not valid in hostnames; treated as delimiters.
                host_closed = true;

                // Without a possible userinfo part there is nothing more to learn.
                if require_host && !userinfo_allowed {
                    break;
                }
            }
            ':' => {
                // Either part of userinfo or the start of a port.
                if !(userinfo_allowed || port_allowed) {
                    break;
                }

                // Drop a dot that was still pending so the port cannot confirm it.
                pending_dot = confirmed_dot;
            }
            '@' => {
                // A second `@` cannot appear in a valid authority.
                if !userinfo_allowed {
                    return (None, None);
                }

                // Everything so far was userinfo: forget all host-related state.
                userinfo_allowed = false;

                pending_dot = None;
                confirmed_dot = None;
                label_can_close = false;
                label_is_open = false;
                only_digits = true;
                plausible_host = true;
                host_closed = false;
            }
            '/' => {
                // When no host is required, the slash itself marks the end, even if the
                // preceding character could not otherwise have been the last one.
                if !require_host {
                    authority_end = idx;
                }
                break;
            }
            // Anything else may end the (possibly empty) authority; the caller validates
            // the remainder of the URL.
            _ => break,
        }
    }

    if !require_host {
        return (Some(authority_end), confirmed_dot);
    }
    if !plausible_host {
        return (None, None);
    }

    if only_digits {
        // A bare number without any dot is not an authority.
        if confirmed_dot.is_none() && authority_end != 0 {
            return (None, None);
        }
    } else if let Some(dot) = confirmed_dot {
        // Not an IP address, so the TLD has to look reasonable. This avoids linking
        // things like `name@v1.1.1`.
        if !valid_tld(&s[dot + 1..]) {
            return (None, None);
        }
    }

    (Some(authority_end), confirmed_dot)
}

/// Returns `true` when `tld` begins with at least two ASCII letters.
///
/// Only the first two characters are inspected; whatever follows them is ignored.
fn valid_tld(tld: &str) -> bool {
    let mut leading = tld.chars();
    let first = leading.next();
    let second = leading.next();

    match (first, second) {
        (Some(a), Some(b)) => a.is_ascii_alphabetic() && b.is_ascii_alphabetic(),
        // Fewer than two characters available.
        _ => false,
    }
}
63 changes: 6 additions & 57 deletions src/email.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::ops::Range;

use crate::domains::find_authority_end;
use crate::scanner::Scanner;

/// Scan for email address starting from the trigger character "@".
Expand Down Expand Up @@ -40,6 +41,9 @@ impl EmailScanner {
break;
}
atom_boundary = true;
} else if c == '@' {
// In `@user@example.com`, we don't want to extract `user@example.com`.
return None;
} else {
break;
}
Expand All @@ -49,40 +53,8 @@ impl EmailScanner {

// See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531
fn find_end(&self, s: &str) -> Option<usize> {
let mut first_in_sub_domain = true;
let mut can_end_sub_domain = false;
let mut first_dot = None;
let mut end = None;

for (i, c) in s.char_indices() {
if first_in_sub_domain {
if Self::sub_domain_allowed(c) {
end = Some(i + c.len_utf8());
first_in_sub_domain = false;
can_end_sub_domain = true;
} else {
break;
}
} else if c == '.' {
if !can_end_sub_domain {
break;
}
first_in_sub_domain = true;
if first_dot.is_none() {
first_dot = Some(i);
}
} else if c == '-' {
can_end_sub_domain = false;
} else if Self::sub_domain_allowed(c) {
end = Some(i + c.len_utf8());
can_end_sub_domain = true;
} else {
break;
}
}

if let Some(end) = end {
if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) {
if let (Some(end), last_dot) = find_authority_end(s, false, true, false) {
if !self.domain_must_have_dot || last_dot.is_some() {
Some(end)
} else {
None
Expand Down Expand Up @@ -120,27 +92,4 @@ impl EmailScanner {
_ => c >= '\u{80}',
}
}

// Character check for "sub-domain" as described in RFC 5321. The RFC 6531
// extension is simplified to "any non-ASCII character", so invalid domains
// can be accepted as well.
fn sub_domain_allowed(c: char) -> bool {
    c.is_ascii_alphanumeric() || c >= '\u{80}'
}
}

/// Reports whether `input` is considered to contain an email address.
///
/// Every `@` in `input` is tried as a trigger position for an [`EmailScanner`] that
/// requires a dot in the domain; the result is `true` as soon as one scan returns `Some`.
#[inline]
pub(crate) fn is_mail(input: &str) -> bool {
    let scanner = EmailScanner {
        domain_must_have_dot: true,
    };

    input
        .char_indices()
        .any(|(at, c)| c == '@' && scanner.scan(input, at).is_some())
}
8 changes: 6 additions & 2 deletions src/finder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use memchr::{memchr, memchr2, memchr3};

use crate::email::EmailScanner;
use crate::scanner::Scanner;
use crate::url::UrlScanner;
use crate::url::{DomainScanner, UrlScanner};

/// A link found in the input text.
#[derive(Debug)]
Expand Down Expand Up @@ -112,6 +112,7 @@ pub struct Links<'t> {
trigger_finder: Box<dyn Fn(&[u8]) -> Option<usize>>,
email_scanner: EmailScanner,
url_scanner: UrlScanner,
domain_scanner: DomainScanner,
}

/// Iterator over spans.
Expand Down Expand Up @@ -213,6 +214,7 @@ impl<'t> Links<'t> {
email_domain_must_have_dot: bool,
) -> Links<'t> {
let url_scanner = UrlScanner;
let domain_scanner = DomainScanner;
let email_scanner = EmailScanner {
domain_must_have_dot: email_domain_must_have_dot,
};
Expand All @@ -232,6 +234,7 @@ impl<'t> Links<'t> {
trigger_finder,
email_scanner,
url_scanner,
domain_scanner,
}
}
}
Expand All @@ -246,7 +249,8 @@ impl<'t> Iterator for Links<'t> {
while let Some(i) = (self.trigger_finder)(slice[find_from..].as_bytes()) {
let trigger = slice.as_bytes()[find_from + i];
let (scanner, kind): (&dyn Scanner, LinkKind) = match trigger {
b':' | b'.' => (&self.url_scanner, LinkKind::Url),
b':' => (&self.url_scanner, LinkKind::Url),
b'.' => (&self.domain_scanner, LinkKind::Url),
b'@' => (&self.email_scanner, LinkKind::Email),
_ => unreachable!(),
};
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
#![deny(missing_docs)]
#![deny(missing_debug_implementations)]

mod domains;
mod email;
mod finder;
mod scanner;
Expand Down
Loading

0 comments on commit b6ad06e

Please sign in to comment.