Skip to content

Commit

Permalink
Merge pull request #24708 from JuliaLang/sk/revstring
Browse files Browse the repository at this point in the history
remove `RevString`; efficient generic `reverseind`
  • Loading branch information
StefanKarpinski authored Dec 4, 2017
2 parents e4cf911 + 5167f17 commit b8ee561
Show file tree
Hide file tree
Showing 17 changed files with 156 additions and 165 deletions.
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,12 @@ This section lists changes that do not have deprecation warnings.
`AbstractArray` types that specialized broadcasting using the old internal API will
need to switch to the new API. ([#20740])

* The `RevString` type has been removed from the language; `reverse(::String)` returns
a `String` with code points (or fragments thereof) in reverse order. In general,
`reverse(s)` should return a string of the same type and encoding as `s` with code
points in reverse order; any string type overrides `reverse` to return a different
type of string must also override `reverseind` to compute reversed indices correctly.

Library improvements
--------------------

Expand Down Expand Up @@ -409,6 +415,15 @@ Library improvements
* The `keys` of an `Associative` are now an `AbstractSet`. `Base.KeyIterator{<:Associative}`
has been changed to `KeySet{K, <:Associative{K}} <: AbstractSet{K}` ([#24580]).

* New function `ncodeunits(s::AbstractString)` gives the number of code units in a string.
The generic definition is constant time but calls `endof(s)` which may be inefficient.
Therefore custom string types may want to define direct `ncodeunits` methods.

* `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so
custom string types do not need to provide their own efficient defintions. The generic
definition relies on `ncodeunits` however, so for optimal performance you may need to
define a custom method for that function.

Compiler/Runtime improvements
-----------------------------

Expand Down
2 changes: 1 addition & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ export
Rational,
Regex,
RegexMatch,
RevString,
RoundFromZero,
RoundDown,
RoundingMode,
Expand Down Expand Up @@ -756,6 +755,7 @@ export
lstrip,
match,
matchall,
ncodeunits,
ndigits,
nextind,
normalize_string,
Expand Down
3 changes: 0 additions & 3 deletions base/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState,
precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState})
precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState})
precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String})
precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64})
precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64})
precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64})
precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64})
precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String})
precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}})
Expand Down
13 changes: 8 additions & 5 deletions base/repl/REPLCompletions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ end
# closed start brace from the end of the string.
function find_start_brace(s::AbstractString; c_start='(', c_end=')')
braces = 0
r = RevString(s)
r = reverse(s)
i = start(r)
in_single_quotes = false
in_double_quotes = false
Expand All @@ -245,18 +245,21 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')')
in_back_ticks = true
end
else
if !in_back_ticks && !in_double_quotes && c == '\'' && !done(r, i) && next(r, i)[1]!='\\'
if !in_back_ticks && !in_double_quotes &&
c == '\'' && !done(r, i) && next(r, i)[1] != '\\'
in_single_quotes = !in_single_quotes
elseif !in_back_ticks && !in_single_quotes && c == '"' && !done(r, i) && next(r, i)[1]!='\\'
elseif !in_back_ticks && !in_single_quotes &&
c == '"' && !done(r, i) && next(r, i)[1] != '\\'
in_double_quotes = !in_double_quotes
elseif !in_single_quotes && !in_double_quotes && c == '`' && !done(r, i) && next(r, i)[1]!='\\'
elseif !in_single_quotes && !in_double_quotes &&
c == '`' && !done(r, i) && next(r, i)[1] != '\\'
in_back_ticks = !in_back_ticks
end
end
braces == 1 && break
end
braces != 1 && return 0:-1, -1
method_name_end = reverseind(r, i)
method_name_end = reverseind(s, i)
startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end))
return (startind:endof(s), method_name_end)
end
Expand Down
2 changes: 1 addition & 1 deletion base/shell.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true;
special::AbstractString="")
s = lstrip(str)
# strips the end but respects the space when the string ends with "\\ "
r = RevString(s)
r = reverse(s)
i = start(r)
c_old = nothing
while !done(r,i)
Expand Down
24 changes: 17 additions & 7 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ julia> 'j' * "ulia"

one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")

# generic number of code units; implementations generally know how long a string
# is though and should override this with a more efficient method
ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1

"""
length(s::AbstractString)
Expand Down Expand Up @@ -233,11 +237,11 @@ end
## Generic indexing functions ##

"""
thisind(str::AbstractString, i::Integer)
thisind(s::AbstractString, i::Integer)
Get the largest valid string index at or before `i`.
Returns `0` if there is no valid string index at or before `i`.
Returns `endof(str)` if `i≥endof(str)`.
If `i` is the index into a character in `s` then `thisind` returns the index of the
start of that character. If `i < start(s)` then it returns `start(s) - 1`.
If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`.
# Examples
```jldoctest
Expand All @@ -253,15 +257,21 @@ julia> thisind("αβγdef", 3)
julia> thisind("αβγdef", 4)
3
julia> thisind("αβγdef", 20)
julia> thisind("αβγdef", 9)
9
julia> thisind("αβγdef", 10)
10
julia> thisind("αβγdef", 20)
10
"""
function thisind(s::AbstractString, i::Integer)
j = Int(i)
isvalid(s, j) && return j
j < start(s) && return 0
e = endof(s)
j >= endof(s) && return e
n = ncodeunits(s)
j > n && return n + 1
prevind(s, j)
end

Expand Down
52 changes: 26 additions & 26 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,6 @@ end
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)

function rsearch(s::AbstractString, c::Chars)
j = search(RevString(s), c)
j == 0 && return 0
endof(s)-j+1
end

"""
rsearch(s::AbstractString, chars::Chars, [start::Integer])
Expand All @@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b")
6:6
```
"""
function rsearch(s::AbstractString, c::Chars, i::Integer)
e = endof(s)
j = search(RevString(s), c, e-i+1)
j == 0 && return 0
e-j+1
function rsearch(s::AbstractString, c::Chars, i::Integer=start(s))
if i < 1
return i == 0 ? 0 : throw(BoundsError(s, i))
end
n = ncodeunits(s)
if i > n
return i == n+1 ? 0 : throw(BoundsError(s, i))
end
# r[reverseind(r,i)] == reverse(r)[i] == s[i]
# s[reverseind(s,j)] == reverse(s)[j] == r[j]
r = reverse(s)
j = search(r, c, reverseind(r, i))
j == 0 ? 0 : reverseind(s, j)
end

function _rsearchindex(s, t, i)
if isempty(t)
return 1 <= i <= nextind(s,endof(s)) ? i :
return 1 <= i <= nextind(s, endof(s)) ? i :
throw(BoundsError(s, i))
end
t = RevString(t)
rs = RevString(s)
t = reverse(t)
rs = reverse(s)
l = endof(s)
t1, j2 = next(t,start(t))
t1, j2 = next(t, start(t))
while true
i = rsearch(s,t1,i)
if i == 0 return 0 end
c, ii = next(rs,l-i+1)
i = rsearch(s, t1, i)
i == 0 && return 0
c, ii = next(rs, reverseind(rs, i))
j = j2; k = ii
matched = true
while !done(t,j)
if done(rs,k)
while !done(t, j)
if done(rs, k)
matched = false
break
end
c, k = next(rs,k)
d, j = next(t,j)
c, k = next(rs, k)
d, j = next(t, j)
if c != d
matched = false
break
end
end
if matched
return nextind(s,l-k+1)
end
i = l-ii+1
matched && return nextind(s, reverseind(s, k))
i = reverseind(s, ii)
end
end

Expand Down
58 changes: 16 additions & 42 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer)
@gc_preserve s unsafe_load(pointer(s, i))
end

"""
ncodeunits(s::AbstractString)
The number of code units in a string. For example, for UTF-8-like data such as
the default `String` type, the number of code units is the number of bytes in
the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the
code unit is `UInt16` so the number of code units is the number of `UInt16`
words in the representation of the string. The expression `codeunit(s, i)` is
valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
See also: [`codeunit`](@ref).
"""
ncodeunits(s::String) = sizeof(s)

write(io::IO, s::String) =
@gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))

Expand All @@ -109,8 +123,8 @@ end
function thisind(s::String, i::Integer)
j = Int(i)
j < 1 && return 0
e = endof(s)
j >= e && return e
n = ncodeunits(s)
j > n && return n + 1
@inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
j -= 1
end
Expand Down Expand Up @@ -281,14 +295,6 @@ function first_utf8_byte(ch::Char)
return b
end

function reverseind(s::String, i::Integer)
j = sizeof(s) + 1 - i
@inbounds while is_valid_continuation(codeunit(s, j))
j -= 1
end
return j
end

## overload methods for efficiency ##

isvalid(s::String, i::Integer) =
Expand Down Expand Up @@ -463,38 +469,6 @@ function string(a::Union{String,Char}...)
return out
end

function reverse(s::String)
dat = Vector{UInt8}(s)
n = length(dat)
n <= 1 && return s
buf = StringVector(n)
out = n
pos = 1
@inbounds while out > 0
ch = dat[pos]
if ch > 0xdf
if ch < 0xf0
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
pos += 3
else
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
pos += 4
end
elseif ch > 0x7f
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
pos += 2
else
buf[out] = ch
out -= 1
pos += 1
end
end
String(buf)
end

function repeat(s::String, r::Integer)
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
n = sizeof(s)
Expand Down
2 changes: 1 addition & 1 deletion base/strings/strings.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

include("strings/errors.jl")
include("strings/types.jl")
include("strings/substring.jl")
include("strings/basic.jl")
include("strings/search.jl")
include("strings/util.jl")
Expand Down
Loading

1 comment on commit b8ee561

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

Please sign in to comment.