Skip to content

Commit

Permalink
Improve performance of readuntil (#20621)
Browse files Browse the repository at this point in the history
* Improve performance of readuntil using strings

* Add backtracking

* Improve performance of readuntil using strings

Heavily inspired by omus and #20621

* Revise test to be completely unoffensive

Looking over my original test I realized it potentially could be
offensive. I've used a different example to avoid any potential issues.

* readuntil with on-the-fly backtrack caching

Caches backtracking information as it is needed, using a SparseVector,
which has a lower memory footprint than a Vector but is more performant
than a Dict.

* Add unicode test for readuntil

Skip testing the I/O producers "File" and "PipeEndpoint" when working
with unicode.

* Remove need for readuntil file

* [wip] reduce code duplication, allow generalized Int indexes

* refactor into a single method instead of a type;
this makes it possible to use readuntil with any array (indexable) object
and optimizes a few more cases
  • Loading branch information
omus authored Sep 28, 2017
1 parent d1b6f78 commit 056b374
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 37 deletions.
113 changes: 82 additions & 31 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,10 @@ flush(io::AbstractPipe) = flush(pipe_writer(io))
# Read-side operations on an `AbstractPipe` are forwarded to the stream returned by `pipe_reader(io)`.
read(io::AbstractPipe, byte::Type{UInt8}) = read(pipe_reader(io), byte)
unsafe_read(io::AbstractPipe, p::Ptr{UInt8}, nb::UInt) = unsafe_read(pipe_reader(io), p, nb)
read(io::AbstractPipe) = read(pipe_reader(io))
# NOTE(review): the `UInt8` and `Char` forwards of `readuntil` are listed twice below; this diff view
# presumably shows them once as removed and once as re-added lines — confirm against the committed file.
readuntil(io::AbstractPipe, arg::UInt8) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::Char) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::AbstractString) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::UInt8) = readuntil(pipe_reader(io), arg)
readuntil(io::AbstractPipe, arg::Char) = readuntil(pipe_reader(io), arg)
# The matcher also receives the output container `out`; it is passed through unchanged.
readuntil_indexable(io::AbstractPipe, target#=::Indexable{T}=#, out) = readuntil_indexable(pipe_reader(io), target, out)

readavailable(io::AbstractPipe) = readavailable(pipe_reader(io))

isreadable(io::AbstractPipe) = isreadable(pipe_reader(io))
Expand Down Expand Up @@ -499,7 +499,7 @@ function readuntil(s::IO, delim::Char)
end

function readuntil(s::IO, delim::T) where T
out = T[]
out = (T === UInt8 ? StringVector(0) : Vector{T}())
while !eof(s)
c = read(s, T)
push!(out, c)
Expand All @@ -510,39 +510,89 @@ function readuntil(s::IO, delim::T) where T
return out
end

# based on code by Glen Hertz
function readuntil(s::IO, t::AbstractString)
l = length(t)
if l == 0
return ""
end
if l > 40
warn("readuntil(IO,AbstractString) will perform poorly with a long string")
# Matcher shared by the string- and array-target `readuntil` methods.
# Reads values of type `eltype(target)` from `io` one at a time, appends each of them to
# `out` (`write` when `out isa IO`, `push!` otherwise) and stops as soon as every element
# of `target` has been matched in order, or when `io` reaches end-of-file.
# The data that was read is delivered through `out`.
# NOTE(review): this diff view interleaves lines of the replaced implementation
# (the `IOBuffer`/`m`/`t`/`i`/`l` sliding-window code and `return String(take!(out))`)
# with the new body — confirm against the committed file before relying on exact line order.
# requires that indices for target are small ordered integers bounded by start and endof
function readuntil_indexable(io::IO, target#=::Indexable{T}=#, out)
T = eltype(target)
# index of the first element of `target`
first = start(target)
# an empty target matches immediately: nothing is read
if done(target, first)
return
end
out = IOBuffer()
m = Vector{Char}(l) # last part of stream to match
t = collect(t)
i = 0
while !eof(s)
i += 1
c = read(s, Char)
write(out, c)
if i <= l
m[i] = c
# `len` is only used to size `cache`
len = endof(target)
# `cache[p] + first` is the position to fall back to after a mismatch at position `p`
local cache # will be lazy initialized when needed
# index of the element that follows `first`
second = next(target, first)[2]
# presumably the highest position whose `cache` entry has been filled in so far
max_pos = second
# position in `target` of the next element that has to match
pos = first
while !eof(io)
c = read(io, T)
# Backtrack until the next target character matches what was found
if out isa IO
write(out, c)
else
# shift to last part of s
for j = 2:l
m[j-1] = m[j]
end
m[l] = c
push!(out, c)
end
if i >= l && m == t
break
# try `c` at `pos`; on a mismatch fall back to an earlier position and try `c` again
while true
c1, pos1 = next(target, pos)
if c == c1
# match: advance to the following target position
pos = pos1
break
elseif pos == first
# mismatch on the very first element: nothing is matched, go read the next value
break
elseif pos == second
# only the first element had been matched: retry `c` against the first element
pos = first
else
# grow cache to contain up to `pos`
if !@isdefined(cache)
cache = zeros(Int, len)
end
while max_pos < pos
b = cache[max_pos] + first
cb, b1 = next(target, b)
ci, max_pos1 = next(target, max_pos)
# entries stay 0 (i.e. fall back to `first`) unless the element at `max_pos`
# equals the element at its own fallback position
if ci == cb
cache[max_pos1] = b1 - first
end
max_pos = max_pos1
end
# fall back to the cached position and retry `c` there
pos = cache[pos] + first
end
end
# the whole target has been matched: stop reading
done(target, pos) && break
end
return String(take!(out))
end

# Read from `io` until `target` has been matched or the stream ends, and return
# everything that was read as a `String`.
function readuntil(io::IO, target::AbstractString)
    # Fast paths for very short targets.
    head = start(target)
    if done(target, head)
        return ""
    end
    ch, rest = next(target, head)
    if done(target, rest) && ch < Char(0x80)
        # a one-character target below 0x80 is handed to the single-byte reader
        return readuntil_string(io, ch % UInt8)
    end

    # Pick a representation of the target that the matcher can index.
    if target isa String
        # match on the raw UTF-8 bytes of the String
        pattern = Vector{UInt8}(target)
        #elseif applicable(codeunit, target)
        # TODO: a more general version of above optimization
        #       would be to permit accessing any string via codeunit
        #       target = CodeUnitVector(target)
    elseif target isa SubString{String}
        # SubString{String} is passed to the matcher as-is
        pattern = target
    else
        # type with unknown indexing behavior: convert to array
        pattern = collect(target)
    end

    # Byte patterns accumulate into a byte vector, everything else into an IOBuffer.
    if eltype(pattern) === UInt8
        sink = StringVector(0)
    else
        sink = IOBuffer()
    end
    readuntil_indexable(io, pattern, sink)
    if sink isa IO
        return String(take!(sink))
    end
    return String(sink)
end

# Read values of type `T` from `io` until the sequence `target` has been matched or
# the stream ends, and return the values that were read as a vector.
function readuntil(io::IO, target::AbstractVector{T}) where T
    # byte targets collect into a StringVector, all other element types into a Vector{T}
    if T === UInt8
        collected = StringVector(0)
    else
        collected = Vector{T}()
    end
    readuntil_indexable(io, target, collected)
    return collected
end


"""
readchomp(x)
Expand Down Expand Up @@ -592,6 +642,7 @@ function read(s::IO, nb::Integer = typemax(Int))
end

# Reading a `String` consumes the rest of the stream and wraps the result in a `String`.
function read(s::IO, ::Type{String})
    return String(read(s))
end

# Fallback for types that have no `read` method on this stream: raise a descriptive error.
function read(s::IO, T::Type)
    error("The IO stream does not support reading objects of type $T.")
end

## high-level iterator interfaces ##

Expand Down
3 changes: 2 additions & 1 deletion base/strings/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j
# A SubString of a SubString indexes into the parent string (`s.string`), shifted by `s.offset`.
SubString(s::SubString, i::Int, j::Int) = SubString(s.string, s.offset+i, s.offset+j)
# Accept any `Integer` indices by converting them to `Int`.
SubString(s::AbstractString, i::Integer, j::Integer) = SubString(s, Int(i), Int(j))
# With only a start index, the substring extends to `endof(s)`.
SubString(s::AbstractString, i::Integer) = SubString(s, i, endof(s))
# NOTE(review): the two `SubString{T}(s::T)` lines below share one signature; this diff view
# presumably shows the removed and the added version — confirm against the committed file.
SubString{T}(s::T) where {T<:AbstractString} = SubString(s, 1, endof(s))
# Whole-string form: covers indices 1 through `endof(s)`.
SubString(s::AbstractString) = SubString(s, 1, endof(s))
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))

# Copy the bytes covered by `p` out of its parent string into a new `String`.
function String(p::SubString{String})
    return unsafe_string(
        pointer(p.string, p.offset + 1),   # address of the first byte of the substring
        nextind(p, p.endof) - 1,           # number of bytes to copy
    )
end
Expand Down
4 changes: 2 additions & 2 deletions src/jl_uv.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,9 +382,9 @@ JL_DLLEXPORT int jl_fs_read(int handle, char *data, size_t len)
JL_DLLEXPORT int jl_fs_read_byte(int handle)
{
uv_fs_t req;
char c;
unsigned char c;
uv_buf_t buf[1];
buf[0].base = &c;
buf[0].base = (char*)&c;
buf[0].len = 1;
int ret = uv_fs_read(jl_io_loop, &req, handle, buf, 1, -1, NULL);
uv_fs_req_cleanup(&req);
Expand Down
33 changes: 30 additions & 3 deletions test/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ s = io(text)
close(s)
push!(l, ("PipeEndpoint", io))


#FIXME See https://github.com/JuliaLang/julia/issues/14747
# Reading from open(::Command) seems to deadlock on Linux/Travis
#=
Expand Down Expand Up @@ -136,10 +135,38 @@ end

verbose = false


for (name, f) in l
local f
io = ()->(s=f(text); push!(open_streams, s); s)
local function io(text=text)
local s = f(text)
push!(open_streams, s)
return s
end

verbose && println("$name readuntil...")
for (t, s, m) in [
("a", "ab", "a"),
("b", "ab", "b"),
("α", "αγ", "α"),
("ab", "abc", "ab"),
("bc", "abc", "bc"),
("αβ", "αβγ", "αβ"),
("aaabc", "ab", "aaab"),
("aaabc", "ac", "aaabc"),
("aaabc", "aab", "aaab"),
("aaabc", "aac", "aaabc"),
("αααβγ", "αβ", "αααβ"),
("αααβγ", "ααβ", "αααβ"),
("αααβγ", "αγ", "αααβγ"),
("barbarbarians", "barbarian", "barbarbarian")]
local t, s, m
@test readuntil(io(t), s) == m
@test readuntil(io(t), SubString(s, start(s), endof(s))) == m
@test readuntil(io(t), GenericString(s)) == m
@test readuntil(io(t), Vector{UInt8}(s)) == Vector{UInt8}(m)
@test readuntil(io(t), collect(s)::Vector{Char}) == Vector{Char}(m)
end
cleanup()

write(filename, text)

Expand Down

0 comments on commit 056b374

Please sign in to comment.