Skip to content

Commit

Permalink
Optimize substrings generated from Regexp
Browse files Browse the repository at this point in the history
  • Loading branch information
twalpole committed Nov 6, 2018
1 parent df1be80 commit 9e15cf9
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 16 deletions.
59 changes: 53 additions & 6 deletions lib/capybara/selector/regexp_disassembler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,48 @@ def initialize(regexp)

# Returns the alternative series of substrings derived from the regexp,
# memoized in @alternated_substrings.
#
# `process(alternation: true)` supplies an array of string series, which
# `remove_or_covered` then prunes in place. If any remaining series is empty
# the whole result is [] (presumably because an alternative that requires no
# substring makes the other alternatives useless as a filter - confirm with
# callers).
#
# @return [Array<Array<String>>]
def alternated_substrings
  @alternated_substrings ||= begin
    # Process the regexp exactly once: an extra bare `process` call would
    # only repeat the parsing work and throw its result away.
    or_strings = process(alternation: true)
    remove_or_covered(or_strings)
    or_strings.any?(&:empty?) ? [] : or_strings
  end
end

# Returns the substrings derived from the regexp without alternation,
# memoized in @substrings.
#
# Takes the first series returned by `process(alternation: false)` and hands
# it to `remove_and_covered`, whose return value is the result.
#
# @return [Array<String>]
def substrings
  # Process the regexp exactly once: an extra bare `process(...).first` call
  # would only repeat the parsing work and throw its result away.
  @substrings ||= remove_and_covered(process(alternation: false).first)
end

private

# Drops every string that is contained in another string of the same list:
# if "ab" and "abcd" are both required, only "abcd" needs to be checked.
#
# The array is modified in place and the same array object is returned.
#
# @param strings [Array<String>] required substrings
# @return [Array<String>] `strings`, reduced to entries no other entry contains
def remove_and_covered(strings)
  # Identical strings contain each other; without de-duplication two equal
  # entries would remove one another and the requirement would be lost.
  strings.uniq!

  # Decide what is covered before mutating, so the decision never depends on
  # the array's intermediate state during deletion.
  covered = strings.select do |sub_string|
    strings.any? { |cover_string| cover_string != sub_string && cover_string.include?(sub_string) }
  end

  strings.delete_if { |string| covered.include?(string) }
end

# Prunes alternated series that are stricter than another series: if we are
# going to match `("a" and "b") or ("ade" and "bce")` it only makes sense to
# match ("a" and "b").
#
# `or_series` (and each series inside it) is modified in place; the same
# outer array is returned.
#
# @param or_series [Array<Array<String>>] series of required substrings
# @return [Array<Array<String>>] `or_series` with redundant series removed
def remove_or_covered(or_series)
  # Ensure minimum sets of strings are being or'd
  or_series.each { |strs| remove_and_covered(strs) }

  # Series requiring the same strings (in any order) are equivalent; keep the
  # first one so equivalent series cannot eliminate each other below.
  or_series.uniq!(&:sort)

  # A series is redundant when every string required by some other series is
  # contained in one of its own strings. Decide this up front rather than
  # while the array is being mutated.
  redundant = or_series.select do |and_strs|
    or_series.any? do |and_strs2|
      next if and_strs.equal? and_strs2

      and_strs2.all? { |required| and_strs.any? { |str| str.include?(required) } }
    end
  end

  or_series.delete_if { |and_strs| redundant.any? { |series| series.equal?(and_strs) } }
end

def process(alternation:)
strs = extract_strings(Regexp::Parser.parse(@regexp), alternation: alternation)
strs = collapse(combine(strs).map(&:flatten))
Expand Down Expand Up @@ -68,8 +98,8 @@ def collapse(strs)
end

def extract_strings(expression, strings = [], alternation: false)
expression.each do |exp|
if optional?(exp)
expression.each do |exp| # rubocop:disable Metrics/BlockLength
if optional?(exp) && !(alternation && zero_or_one?(exp))
strings.push(nil)
next
end
Expand All @@ -87,12 +117,25 @@ def extract_strings(expression, strings = [], alternation: false)
if exp.terminal?
case exp.type
when :literal
strings.push(exp.text * min_repeat(exp))
if zero_or_one?(exp)
strings.push(Set.new([[''], [exp.text]]))
next
else
strings.push(exp.text * min_repeat(exp))
end
when :escape
strings.push(exp.char * min_repeat(exp))
if zero_or_one?(exp)
strings.push(Set.new([[''], [exp.text]]))
next
else
strings.push(exp.char * min_repeat(exp))
end
else
strings.push(nil)
end
elsif alternation && zero_or_one?(exp)
strings.push(Set.new([[''], extract_strings(exp, alternation: true)]))
next
else
min_repeat(exp).times { extract_strings(exp, strings, alternation: alternation) }
end
Expand All @@ -101,6 +144,10 @@ def extract_strings(expression, strings = [], alternation: false)
strings
end

# True when the expression's `quantity` is exactly [0, 1].
# NOTE(review): presumably this corresponds to the `?` quantifier - confirm
# against the regexp parser's `quantity` contract.
def zero_or_one?(exp)
  optional_once = [0, 1]
  exp.quantity == optional_once
end

def alternative_strings(expression)
alternatives = expression.alternatives.map { |sub_exp| extract_strings(sub_exp, alternation: true) }
if alternatives.all?(&:any?)
Expand Down
46 changes: 36 additions & 10 deletions spec/regexp_dissassembler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,37 @@
/abc./ => %w[abc],
/abc.*/ => %w[abc],
/abc.def/ => %w[abc def],
/abc.def.ghi/ => %w[abc def ghi]
/abc.def.ghi/ => %w[abc def ghi],
/abc.abcd.abcde/ => %w[abcde],
/.*/ => []
)
end

it 'handles optional characters' do
verify_strings(
it 'ignores optional characters for substrings' do
{
/abc*def/ => %w[ab def],
/abc*/ => %w[ab],
/c*/ => [],
/abc?def/ => %w[ab def],
/abc?/ => %w[ab],
/abc?def?/ => %w[ab de],
/abc?def?g/ => %w[ab de g]
/abc?def?g/ => %w[ab de g],
/d?/ => []
}.each do |regexp, expected|
expect(Capybara::Selector::RegexpDisassembler.new(regexp).substrings).to eq expected
end
end

it 'handles optional characters for #alternated_substrings' do
verify_alternated_strings(
/abc*def/ => [%w[ab def]],
/abc*/ => [%w[ab]],
/c*/ => [],
/abc?def/ => [%w[abdef], %w[abcdef]],
/abc?/ => [%w[ab]],
/abc?def?/ => [%w[abde], %w[abcde]],
/abc?def?g/ => [%w[abdeg], %w[abdefg], %w[abcdeg], %w[abcdefg]],
/d?/ => []
)
end

Expand Down Expand Up @@ -111,24 +130,31 @@
end
end

it 'handles alternation for #options' do
it 'handles alternation for #alternated_substrings' do
verify_alternated_strings(
/abc|def/ => [%w[abc], %w[def]],
/ab(?:c|d)/ => [%w[abc], %w[abd]],
/ab(c|d|e)fg/ => [%w[abcfg], %w[abdfg], %w[abefg]],
/ab?(c|d)fg/ => [%w[a cfg], %w[a dfg]],
/ab?(c|d)fg/ => [%w[acfg], %w[adfg], %w[abcfg], %w[abdfg]],
/ab(c|d)ef/ => [%w[abcef], %w[abdef]],
/ab(cd?|ef)g/ => [%w[abc g], %w[abefg]],
/ab(cd?|ef)g/ => [%w[abcg], %w[abcdg], %w[abefg]],
/ab(cd|ef*)g/ => [%w[abcdg], %w[abe g]],
/ab|cd*/ => [%w[ab], %w[c]],
/cd(?:ef|gh)|xyz/ => [%w[cdef], %w[cdgh], %w[xyz]],
/(cd(?:ef|gh)|xyz)/ => [%w[cdef], %w[cdgh], %w[xyz]],
/cd(ef|gh)+/ => [%w[cdef], %w[cdgh]],
/cd(ef|gh)?/ => [%w[cd]],
/cd(ef|gh)?ij/ => [%w[cd ij]],
/cd(ef|gh)?ij/ => [%w[cdij], %w[cdefij], %w[cdghij]],
/cd(ef|gh)+ij/ => [%w[cdef ij], %w[cdgh ij]],
/cd(ef|gh){2}ij/ => [%w[cdefefij], %w[cdefghij], %w[cdghefij], %w[cdghghij]],
/(cd(ef|g*))/ => [%w[cd]]
/(cd(ef|g*))/ => [%w[cd]],
/a|b*/ => [],
/ab(?:c|d?)/ => [%w[ab]],
/ab(c|d)|a*/ => [],
/(abc)?(d|e)/ => [%w[d], %w[e]],
/(abc*de)?(d|e)/ => [%w[d], %w[e]],
/(abc*de)?(d|e?)/ => [],
/(abc)?(d|e?)/ => []
)
end

Expand Down Expand Up @@ -193,7 +219,7 @@ def verify_strings(hsh)

# Asserts, for every regexp => expected pair in `hsh`, that
# RegexpDisassembler#alternated_substrings equals the expected value.
#
# @param hsh [Hash{Regexp => Array}] regexps mapped to their expected result
# @param wrap [Boolean] when true, a non-empty expectation is wrapped in one
#   extra array before comparing
def verify_alternated_strings(hsh, wrap: false)
  hsh.each do |regexp, expected|
    # Wrap exactly once, and never wrap []: an empty expectation is compared
    # as-is.
    expected = [expected] if wrap && (expected != [])
    expect(Capybara::Selector::RegexpDisassembler.new(regexp).alternated_substrings).to eq expected
  end
end
Expand Down

0 comments on commit 9e15cf9

Please sign in to comment.