Skip to content

Commit

Permalink
Use Nokogiri::HTML5::Inference for parsing HTML fragments
Browse files Browse the repository at this point in the history
  • Loading branch information
marcoroth committed Apr 27, 2024
1 parent 90b3af3 commit 2d2e052
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 60 deletions.
3 changes: 3 additions & 0 deletions gem/Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ PATH
deface (~> 1.9)
html_press (~> 0.8.2)
nokogiri (~> 1.0)
nokogiri-html5-inference (~> 0.2)
phlex (~> 1.6)
phlex-rails (>= 0.9, < 2.0)
syntax_tree (~> 6.0)
Expand Down Expand Up @@ -105,6 +106,8 @@ GEM
racc (~> 1.4)
nokogiri (1.16.2-x86_64-linux)
racc (~> 1.4)
nokogiri-html5-inference (0.2.0)
nokogiri (~> 1.14)
parallel (1.22.1)
parser (3.2.0.0)
ast (~> 2.4.1)
Expand Down
21 changes: 2 additions & 19 deletions gem/lib/phlexing/parser.rb
Original file line number Diff line number Diff line change
@@ -1,32 +1,15 @@
# frozen_string_literal: true

require "nokogiri"
require "nokogiri/html5/inference"

module Phlexing
  # Entry point that turns raw template source into a parsed Nokogiri structure.
  class Parser
    # Prepares +source+ and parses it as HTML5.
    #
    # The input is passed through ERBTransformer.call and then Minifier.call;
    # the resulting string is handed to Nokogiri::HTML5::Inference.parse, whose
    # return value is returned unchanged.
    #
    # No <html>/<head>/<body> sniffing is done here: choosing between a full
    # document and a fragment is left entirely to Nokogiri::HTML5::Inference,
    # so the input is parsed exactly once.
    #
    # @param source the template source; forwarded as-is to ERBTransformer.call
    #   (NOTE(review): nil handling is presumably done by ERBTransformer/Minifier — confirm)
    # @return the object produced by Nokogiri::HTML5::Inference.parse
    #   (NOTE(review): presumably a document, fragment, node set or element
    #   depending on the markup — confirm against the gem's documentation)
    def self.call(source)
      source = ERBTransformer.call(source)
      source = Minifier.call(source)

      Nokogiri::HTML5::Inference.parse(source)
    end
  end
end
1 change: 1 addition & 0 deletions gem/phlexing.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
spec.add_dependency "deface", "~> 1.9"
spec.add_dependency "html_press", "~> 0.8.2"
spec.add_dependency "nokogiri", "~> 1.0"
spec.add_dependency "nokogiri-html5-inference", "~> 0.2"
spec.add_dependency "phlex", "~> 1.6"
spec.add_dependency "phlex-rails", ">= 0.9", "< 2.0"
spec.add_dependency "syntax_tree", "~> 6.0"
Expand Down
1 change: 0 additions & 1 deletion gem/test/phlexing/converter/uppercase_tags_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ class Phlexing::Converter::UppercaseTagsTest < Minitest::Spec

expected = <<~PHLEX.strip
html do
whitespace
head
whitespace
body
Expand Down
82 changes: 42 additions & 40 deletions gem/test/phlexing/parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,24 @@

require_relative "../test_helper"

# Compares serialized markup against +expected+ after normalizing it:
# the HTML 4.0 Transitional doctype declaration is removed from +actual+ and
# the remainder is passed through String#squish before calling assert_equal.
def assert_dom_equal(expected, actual)
  doctype = %(<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">)
  normalized = actual.gsub(doctype, "").squish

  assert_equal expected, normalized
end

module Phlexing
class ParserTest < Minitest::Spec
before(:each) do
@nodes = []
end

def extract_children(node)
@nodes << node.name
@nodes << node.name if node.respond_to?(:name)

if node&.children
node.children.each do |node|
extract_children(node)
if node.is_a?(Nokogiri::XML::NodeSet)
node.each do |n|
extract_children(n)
end
else
if node&.children
node.children.each do |node|
extract_children(node)
end
end
end

Expand All @@ -28,88 +30,88 @@ def extract_children(node)
parser = Parser.call(nil)

assert_equal "#document-fragment", extract_children(parser).join(",")
assert_dom_equal "", parser.to_xml
assert_equal "", parser.to_html
assert_equal "#document-fragment", parser.name
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
end

it "should handle empty string" do
parser = Parser.call("")

assert_equal "#document-fragment", extract_children(parser).join(",")
assert_dom_equal "", parser.to_xml
assert_equal "", parser.to_html
assert_equal "#document-fragment", parser.name
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
end

it "should handle simple div" do
parser = Parser.call("<div></div>")

assert_equal "#document-fragment,div", extract_children(parser).join(",")
assert_dom_equal %(<div></div>), parser.to_html
assert_equal %(<div></div>), parser.to_html
assert_equal "#document-fragment", parser.name
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
end

it "should handle ERB" do
parser = Parser.call("<div><%= some_method %></div>")

assert_equal "#document-fragment,div,erb,text", extract_children(parser).join(",")
assert_dom_equal %(<div> <erb loud=""> some_method </erb> </div>), parser.to_xml
assert_equal %(<div><erb loud=""> some_method </erb></div>), parser.to_html
assert_equal "#document-fragment", parser.name
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
end

it "should handle html" do
parser = Parser.call("<html></html>")

assert_equal "document,html,html", extract_children(parser).join(",")
assert_dom_equal %(<html></html>), parser.to_xml
assert_equal "document,html,head,body", extract_children(parser).join(",")
assert_equal %(<html></html>), parser.to_html
assert_equal "document", parser.name
assert_equal Nokogiri::HTML4::Document, parser.class
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
end

it "should handle html, head and body" do
parser = Parser.call("<html><head><title>Title</title></head><body><h1>Hello</h1></body></html>")

assert_equal "document,html,html,head,title,text,body,h1,text", extract_children(parser).join(",")
assert_dom_equal %(<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Title</title> </head> <body><h1>Hello</h1></body> </html>), parser.to_xml
assert_equal "document,html,head,title,text,body,h1,text", extract_children(parser).join(",")
assert_equal %(<html><head><title>Title</title></head><body><h1>Hello</h1></body></html>), parser.to_html
assert_equal "document", parser.name
assert_equal Nokogiri::HTML4::Document, parser.class
assert_equal Nokogiri::HTML5::Document, parser.class
end

it "should handle html and head" do
parser = Parser.call("<html><head><title>Title</title></head></html>")

assert_equal "document,html,html,head,title,text", extract_children(parser).join(",")
assert_dom_equal %(<html><head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Title</title> </head></html>), parser.to_xml
assert_equal "document,html,head,title,text,body", extract_children(parser).join(",")
assert_equal %(<html><head><title>Title</title></head></html>), parser.to_html
assert_equal "document", parser.name
assert_equal Nokogiri::HTML4::Document, parser.class
assert_equal Nokogiri::HTML5::Document, parser.class
end

it "should handle html and body" do
parser = Parser.call("<html><body><h1>Hello</h1></body></html>")

assert_equal "document,html,html,body,h1,text", extract_children(parser).join(",")
assert_dom_equal %(<html><body><h1>Hello</h1></body></html>), parser.to_xml
assert_equal "document,html,body,h1,text", extract_children(parser).join(",")
assert_equal %(<html><body><h1>Hello</h1></body></html>), parser.to_html
assert_equal "document", parser.name
assert_equal Nokogiri::HTML4::Document, parser.class
assert_equal Nokogiri::HTML5::Document, parser.class
end

it "should handle head and body" do
parser = Parser.call("<head><title>Title</title></head><body><h1>Hello</h1></body>")

assert_equal "html,head,title,text,body,h1,text", extract_children(parser).join(",")
assert_dom_equal %(<html> <head> <title>Title</title> </head> <body> <h1>Hello</h1> </body> </html>), parser.to_xml
assert_equal "html", parser.name
assert_equal Nokogiri::XML::Element, parser.class
assert_equal "head,title,text,body,h1,text", extract_children(parser).join(",")
assert_equal %(<head><title>Title</title></head><body><h1>Hello</h1></body>), parser.to_html
assert_equal false, parser.respond_to?(:name)
assert_equal Nokogiri::XML::NodeSet, parser.class
end

it "should handle head with title" do
parser = Parser.call("<head><title>Title</title></head>")

assert_equal "head,title,text", extract_children(parser).join(",")
assert_dom_equal %(<head> <title>Title</title> </head>), parser.to_xml
assert_equal %(<head><title>Title</title></head>), parser.to_html
assert_equal "head", parser.name
assert_equal Nokogiri::XML::Element, parser.class
end
Expand All @@ -118,7 +120,7 @@ def extract_children(node)
parser = Parser.call("<head></head>")

assert_equal "head", extract_children(parser).join(",")
assert_dom_equal %(<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>), parser.to_html
assert_equal %(<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>), parser.to_html
assert_equal "head", parser.name
assert_equal Nokogiri::XML::Element, parser.class
end
Expand All @@ -127,18 +129,18 @@ def extract_children(node)
parser = Parser.call("<body><h1>Hello</h1></body>")

assert_equal "body,h1,text", extract_children(parser).join(",")
assert_dom_equal %(<body> <h1>Hello</h1> </body>), parser.to_xml
assert_equal "body", parser.name
assert_equal Nokogiri::XML::Element, parser.class
assert_equal %(<body><h1>Hello</h1></body>), parser.to_html
# assert_equal "body", parser.name
assert_equal Nokogiri::XML::NodeSet, parser.class
end

it "should handle body" do
parser = Parser.call("<body></body>")

assert_equal "body", extract_children(parser).join(",")
assert_dom_equal %(<body></body>), parser.to_html
assert_equal "body", parser.name
assert_equal Nokogiri::XML::Element, parser.class
assert_equal %(<body></body>), parser.to_html
# assert_equal "body", parser.name
assert_equal Nokogiri::XML::NodeSet, parser.class
end
end
end

0 comments on commit 2d2e052

Please sign in to comment.