From ae2d264f4a25319071cdae70128e24af84211059 Mon Sep 17 00:00:00 2001
From: Patrik Ragnarsson
Date: Wed, 5 Oct 2022 21:52:09 +0200
Subject: [PATCH] Validate the normalized hostname

According to the "preferred format" used by DNS.

See https://en.wikipedia.org/wiki/Domain_Name_System#Domain_name_syntax,_internationalization

Moves one invalid URL to the set of invalid URLs (if you enter
http://www..twingly..com/ in the address bar in Chrome, it does a
search, doesn't try to visit any site).
---
 lib/twingly/url.rb           | 27 ++++++++++++++++++++++++---
 spec/lib/twingly/url_spec.rb |  7 +------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/lib/twingly/url.rb b/lib/twingly/url.rb
index 35eb392..6099902 100644
--- a/lib/twingly/url.rb
+++ b/lib/twingly/url.rb
@@ -22,6 +22,8 @@ class URL
       Addressable::URI::InvalidURIError,
       PublicSuffix::DomainInvalid,
     ].freeze
+    DOT = "."
+    HYPHEN = "-"
     CARRIAGE_RETURN = "\u000D"
     LINE_FEED = "\u000A"
     NBSP = "\u00A0"
@@ -34,16 +36,20 @@ class URL
     ].join.freeze
     LEADING_AND_TRAILING_WHITESPACE =
       /\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze
+    LETTERS_DIGITS_HYPHEN = /\A[a-zA-Z0-9-]+\z/.freeze
 
     private_constant :ACCEPTED_SCHEMES
     private_constant :CUSTOM_PSL
     private_constant :STARTS_WITH_WWW
     private_constant :ENDS_WITH_SLASH
     private_constant :ERRORS_TO_EXTEND
+    private_constant :DOT
+    private_constant :HYPHEN
     private_constant :NBSP
     private_constant :SPACE
     private_constant :WHITESPACE_CHARS
     private_constant :LEADING_AND_TRAILING_WHITESPACE
+    private_constant :LETTERS_DIGITS_HYPHEN
 
     class << self
       def parse(potential_url)
@@ -91,10 +97,9 @@ def strip_whitespace(input)
         input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
       end
 
-      # Workaround for the following bug in addressable:
-      # https://github.com/sporkmonger/addressable/issues/224
       def try_addressable_normalize(addressable_uri)
-        addressable_uri.normalize
+        ascii_host = addressable_uri.normalize.host
+        raise Twingly::URL::Error::ParseError unless valid_hostname?(ascii_host)
       rescue ArgumentError => error
         if error.message.include?("invalid byte sequence in UTF-8")
           raise Twingly::URL::Error::ParseError
@@ -103,11 +108,27 @@ def try_addressable_normalize(addressable_uri)
         raise
       end
 
+      def valid_hostname?(hostname)
+        # No need to check the TLD, the public suffix list does that
+        labels = hostname.split(DOT)[0...-1].map(&:to_s)
+
+        labels.all? { |label| valid_label?(label) }
+      end
+
+      def valid_label?(label)
+        return false if label.start_with?(HYPHEN)
+        return false if label.end_with?(HYPHEN)
+
+        label.match?(LETTERS_DIGITS_HYPHEN)
+      end
+
       private :new
       private :internal_parse
       private :clean_input
       private :strip_whitespace
       private :try_addressable_normalize
+      private :valid_hostname?
+      private :valid_label?
     end
 
     def initialize(addressable_uri, public_suffix_domain)
diff --git a/spec/lib/twingly/url_spec.rb b/spec/lib/twingly/url_spec.rb
index 04be030..2008191 100644
--- a/spec/lib/twingly/url_spec.rb
+++ b/spec/lib/twingly/url_spec.rb
@@ -39,6 +39,7 @@ def invalid_urls
     "http://.gl/xxx",
     "http://.twingly.com/",
     "http://www.twingly.",
+    "http://www..twingly..com/",
 
     # Test that we can handle upstream bug in Addressable, references:
     # https://github.com/twingly/twingly-url/issues/62
@@ -564,12 +565,6 @@ def leading_and_trailing_whitespace
       it { is_expected.to eq(expected) }
     end
 
-    context "oddly enough, does not alter URLs with consecutive dots" do
-      let(:url) { "http://www..twingly..com/" }
-
-      it { is_expected.to eq(url) }
-    end
-
     context "does not add www. to blogspot URLs" do
       let(:url) { "http://jlchen1026.blogspot.com/" }
 