Skip to content

Commit

Permalink
Improve performance of readuntil using strings
Browse files Browse the repository at this point in the history
Heavily inspired by omus and #20621
  • Loading branch information
vtjnash authored and omus committed Feb 28, 2017
1 parent acdfb04 commit 2a449a2
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 34 deletions.
91 changes: 65 additions & 26 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,6 @@ unsafe_read(io::AbstractPipe, p::Ptr{UInt8}, nb::UInt) = unsafe_read(pipe_reader
# Read-side operations on an AbstractPipe: every method below forwards the call,
# with its arguments unchanged, to the object returned by `pipe_reader(io)`.
read(io::AbstractPipe) = read(pipe_reader(io))
readuntil(io::AbstractPipe, arg::UInt8) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::Char) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::AbstractString) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg) = readuntil(pipe_reader(io), arg)
readavailable(io::AbstractPipe) = readavailable(pipe_reader(io))

isreadable(io::AbstractPipe) = isreadable(pipe_reader(io))
Expand Down Expand Up @@ -434,7 +432,7 @@ function readuntil(s::IO, delim::Char)
end

function readuntil{T}(s::IO, delim::T)
out = T[]
out = (T === UInt8 ? StringVector(0) : Vector{T}())
while !eof(s)
c = read(s, T)
push!(out, c)
Expand All @@ -445,41 +443,80 @@ function readuntil{T}(s::IO, delim::T)
return out
end

function _rfind_sub_c(target, endc, endstate)
    # Recovery step after a mismatch while matching `target` against incoming data.
    # The data seen so far equals the elements of `target` that precede the element
    # whose successor state is `endstate`, followed by `endc`. Return the state just
    # past the longest prefix of `target` that is also a suffix of that data, or
    # `start(target)` when no prefix overlaps.
    # Requirements: iterating `target` is pure, its states can be compared for
    # equality, and `endstate` is a valid state of `target`.
    origin = start(target)
    offset = origin              # where the candidate overlap begins inside the data
    while offset != endstate
        pstate = origin          # walks the prefix of `target`
        dstate = offset          # walks the data, in lockstep with `pstate`
        while true
            pc, pstate = next(target, pstate)
            dc, dstate = next(target, dstate)
            if dstate == endstate
                # last data position: it really holds `endc`, not `dc`
                pc == endc && return pstate
                break
            elseif dc != pc
                break
            end
        end
        # this offset failed; try the next (shorter) overlap
        offset = next(target, offset)[2]
    end
    return origin
end

function readuntil(s::IO, target::AbstractString)
l = length(target)
if l == 0
ti = start(target)
if done(target, ti)
return ""
end
t = collect(target)
backtrack = zeros(Int, l)
for i = 2:l
b = backtrack[i - 1] + 1
if t[i] == t[b]
backtrack[i] = b
end
c1, tc1 = next(target, ti)
if done(target, tc1) && c1 < Char(0x80)
return readuntil_string(s, c1 % UInt8)
end
out = IOBuffer()
i = 0
while !eof(s)
c = read(s, Char)
write(out, c)
while i != 0 && c != t[i + 1]
if i > 0
i = backtrack[i]
else
i = 0
end
end
if c == t[i + 1]
i += 1
end
if i == l
break
tc, ti = next(target, ti)
if c == tc
done(target, ti) && break
else
ti = _rfind_sub_c(target, c, ti)
end
end
return String(take!(out))
end

function readuntil(s::IO, target::String)
    # Read bytes from `s` up to and including the first occurrence of `target`
    # (or until `eof(s)` if it never occurs) and return them as a `String`.
    # An empty `target` returns "" without reading anything.
    si = start(target)
    if done(target, si)
        return ""
    end
    c1, si2 = next(target, si)
    if done(target, si2) && c1 < Char(0x80)
        # `target` is a single character below 0x80: hand off to
        # `readuntil_string` with that character's byte value
        return readuntil_string(s, c1 % UInt8)
    end
    targetb = Vector{UInt8}(target) # convert String to a utf8-byte-iterator
    # An iteration state is only meaningful for the collection that produced it,
    # so the byte matcher starts from `targetb`'s own state instead of reusing
    # the state obtained from the String above.
    ti = start(targetb)
    out = StringVector(0)
    while !eof(s)
        c = read(s, UInt8)
        push!(out, c)
        tc, ti = next(targetb, ti)
        if c == tc
            # matched so far; stop once the whole delimiter has been consumed
            done(targetb, ti) && break
        else
            # mismatch: resume from the longest prefix of `targetb` that still
            # matches the tail of what has been read
            ti = _rfind_sub_c(targetb, c, ti)
        end
    end
    return String(out)
end

"""
readchomp(x)
Expand Down Expand Up @@ -523,11 +560,13 @@ Read at most `nb` bytes from `s`, returning a `Vector{UInt8}` of the bytes read.
function read(s::IO, nb=typemax(Int))
    # Read up to `nb` bytes from `s`; the result is trimmed to the byte count
    # reported by `readbytes!`.
    # With the default `nb`, start from a 1024-byte buffer and let readbytes!
    # grow the array progressively instead of taking the risk of over-allocating.
    b = Vector{UInt8}(nb == typemax(Int) ? 1024 : nb)
    nr = readbytes!(s, b, nb)
    return resize!(b, nr)
end

# Fallback for element types with no more specific `read` method: always raises
# an error naming the requested type.
function read(s::IO, T::Type)
    error("The IO stream does not support reading objects of type $T.")
end

"""
readstring(stream::IO)
readstring(filename::AbstractString)
Expand Down
25 changes: 17 additions & 8 deletions test/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,6 @@ s = io(text)
close(s)
push!(l, ("PipeEndpoint", io))


# readuntil
@test readuntil(IOBuffer("aaabc"), "aab") == "aaab"
@test readuntil(IOBuffer("assassassinass"), "assassin") == "assassassin"


#FIXME See https://github.com/JuliaLang/julia/issues/14747
# Reading from open(::Command) seems to deadlock on Linux/Travis
#=
Expand Down Expand Up @@ -141,9 +135,24 @@ end

verbose = false


for (name, f) in l
io = ()->(s=f(text); push!(open_streams, s); s)
# Per-iteration helper: calls `f` on `text` (defaulting to the enclosing `text`),
# records the result in `open_streams`, and returns it.
local function io(text=text)
local s = f(text)
push!(open_streams, s)
return s
end

verbose && println("$name readuntil...")
# Each case is (input text, delimiter, expected result of `readuntil`).
for (t, s, m) in [
("aaabc", "aab", "aaab"),
("assassassinass", "assassin", "assassassin")]
local t, s, m
# delimiter passed exactly as listed in the case table
@test readuntil(io(t), s) == m

# same delimiter, re-wrapped as a SubString spanning the whole string
s = SubString(s, start(s), endof(s))
@test readuntil(io(t), s) == m
end


write(filename, text)

Expand Down

0 comments on commit 2a449a2

Please sign in to comment.