From c2658e10e7d3fc14dfc436e8158a4dd64a9cc94a Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Tue, 20 Feb 2018 15:50:42 -0500 Subject: [PATCH] Separate AbstractString interface from iteration protocol Up until now, the basic interface new AbstractStrings had to implement was: ``` struct MyString; ...; end next(::MyString, i::Int64)::Tuple{Char, Int64} isvalid(::MyString, i::Int64)::Bool ncodeunits(::MyString)::Int64 ``` In this interface, the iteration state (i.e. the second tuple element returned from `next`) always had to be the next valid string index. This is inconvenient for several reasons: 1. The iteration protocol will change, breaking every use of this API 2. Some strings may want iteration states other than linear indices for efficiency reasons (e.g. RopeStrings) 3. String implementors can no longer assume that the second argument they receive was necessarily produced by them, so may need to do various validation of the iteration state on every iteration. This PR attempts to remedy this by introducing a new generic `Next` iterator. The iterator is defined to iterate (value, next index) tuples, which is what the `next` method on strings returns at the moment, and thus allows for a natural transition from the older API. Thus, this PR changes the requisite method to implement from: ``` next(::MyString, i::Int)::Tuple{Char, Int} ``` to ``` next(::StringNext{MyString}, state::Any)::Tuple{Tuple{Char, Int}, Any} ``` where `StringNext{T} = Next{T, EachStringIndex{T}}` Efficient implementations of iteration over strings, the indices as well as `Pairs` can be derived from this iterator. The reason this iterator is useful is perhaps best understood by considering strings to be variable-length encodings of character arrays. In a variable-length encoding, one generally decodes the value and the length (i.e. the index of the next element) at the same time, so it makes sense to base the API on the implementation of an iterator with these semantics.
To demonstrate the use and test the new abstract implementations based on this iterator, there are three string types in the test suite: - CharString, as before, which simply wraps an array of `Chars` with direct indexing. The only change to this iterator is to change the signature of the `next` method. - RopeString, which strings together several Strings, and more importantly does not have efficient linear iteration state. - DecodeString, which decodes escape sequences on the fly as part of iteration. This string type demonstrates one string type wrapping another string type to test the interface from both sides --- base/iterators.jl | 55 ++++++++++ base/regex.jl | 4 +- base/strings/basic.jl | 65 +----------- base/strings/iteration.jl | 120 ++++++++++++++++++++++ base/strings/string.jl | 7 +- base/strings/substring.jl | 7 +- base/strings/util.jl | 8 +- base/sysimg.jl | 2 +- stdlib/Test/src/Test.jl | 4 +- test/strings/basic.jl | 30 ++++-- test/strings/teststringtypes.jl | 174 ++++++++++++++++++++++++++++++++ 11 files changed, 393 insertions(+), 83 deletions(-) create mode 100644 base/strings/iteration.jl create mode 100644 test/strings/teststringtypes.jl diff --git a/base/iterators.jl b/base/iterators.jl index 2f69d74346208..34399dc89937b 100644 --- a/base/iterators.jl +++ b/base/iterators.jl @@ -247,6 +247,60 @@ setindex!(v::Pairs, value, key) = (v.data[key] = value; v) get(v::Pairs, key, default) = get(v.data, key, default) get(f::Base.Callable, collection::Pairs, key) = get(f, v.data, key) +""" + Iterators.Next(values::A, idx::eltype(I)=firstindex(values), itr::I=eachindex(values)) where {A,I} + +Returns a tuple of a value and the subsequent index. This iterator is useful for the +implementation of variably-length encoded arrays where decoding the element and +obtaining the offset or index of the next element generally involve the same computation. 
+ +A default implementation is provided that simply iterates over `eachindex` and uses +`getindex` to obtain the value corresponding to the index. It is allowed (and encouraged) +to overload iteration for a specific `Next{A}` in order to provide a more efficient +implementation that computes both in one step. + +The index in the last tuple will generally be equivalent to `lastindex(values)+1` +though users should only rely on the fact that it is `> lastindex(values)` to allow +implemntations the flexibility to choose a different value. + +The `idx` argument provides a means by which to resume this iterator from a given index. +The first value returned by the `Next` iterator should correspond to the element at `idx`. +Please note that if you override iteration for `Next{A}` and your iteration state is not +the next index, you will have to additionally overload `Next(data::A, idx, itr::I)` for +four `A`. + +# Examples: + +julia> first(Next(['a','b','c'])) +('a', 2) + +julia> first(Next(['a','b','c'], 3)) +('c', 4) +""" +struct Next{A, I} + data::A + itr::I + Next{A}(data::A, itr::I) where {A, I} = new{A, I}(data, itr) +end +Next(data, idx, itr) = Rest(Next{typeof(data)}(data, itr), idx) +Next(data, idx) = Next(data, idx, eachindex(data)) +Next(data) = Next{typeof(data)}(data, eachindex(data)) + +start(lip::Next) = start(lip.itr) +done(lip::Next, state) = done(lip.itr, state) +function next(lip::Next, state) + nidx = ns = next(lip.itr, state) + # A bit awkward now, done for consistency with the new iteration protocol + done(lip.itr, ns) && (nidx = lastindex(lip.itr)+1) + (lip.data[ns], nidx), ns +end + +length(lip::Next) = length(lip.itr) +eltype(::Type{Next{A, I}}) where {A, I} = Tuple{eltype(A), eltype(I)} + +IteratorSize(::Type{<:Next{I}}) where {I} = IteratorSize(I) +IteratorEltype(::Type{<:Next{I}}) where {I} = IteratorEltype(I) + # zip abstract type AbstractZipIterator end @@ -1070,6 +1124,7 @@ end function fixpoint_iter_type(itrT::Type, valT::Type, 
stateT::Type) nextvalstate = Base._return_type(next, Tuple{itrT, stateT}) nextvalstate <: Tuple{Any, Any} || return Any + nextvalstate === Union{} && return Union{} nextvalstate = Tuple{ typejoin(valT, fieldtype(nextvalstate, 1)), typejoin(stateT, fieldtype(nextvalstate, 2))} diff --git a/base/regex.jl b/base/regex.jl index 6f75eee72e5f5..6bed77e23d581 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -228,7 +228,9 @@ ncodeunits(s::SubstitutionString) = ncodeunits(s.string) codeunit(s::SubstitutionString) = codeunit(s.string) codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i) isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i) -next(s::SubstitutionString, i::Integer) = next(s.string, i) +start(s::StringNext{<:SubstitutionString}) = start(StringNext(s.data.string)) +next(s::StringNext{<:SubstitutionString}, state) = next(StringNext(s.data.string), state) +done(s::StringNext{<:SubstitutionString}, state) = done(StringNext(s.data.string), state) function show(io::IO, s::SubstitutionString) print(io, "s") diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 50091f5f11049..c3f507902f71e 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -42,6 +42,9 @@ AbstractString ## required string functions ## +# N.B. iteration for StringNext{T} is a required part of the iteration protocol +include("strings/iteration.jl") + """ ncodeunits(s::AbstractString) -> Int @@ -121,52 +124,6 @@ Stacktrace: @propagate_inbounds isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? throw(MethodError(isvalid, (s, i))) : isvalid(s, Int(i)) -""" - next(s::AbstractString, i::Integer) -> Tuple{Char, Int} - -Return a tuple of the character in `s` at index `i` with the index of the start -of the following character in `s`. This is the key method that allows strings to -be iterated, yielding a sequences of characters. If `i` is out of bounds in `s` -then a bounds error is raised. 
The `next` function, as part of the iteration -protocoal may assume that `i` is the start of a character in `s`. - -See also: [`getindex`](@ref), [`start`](@ref), [`done`](@ref), -[`checkbounds`](@ref) -""" -@propagate_inbounds next(s::AbstractString, i::Integer) = typeof(i) === Int ? - throw(MethodError(next, (s, i))) : next(s, Int(i)) - -## basic generic definitions ## - -start(s::AbstractString) = 1 -done(s::AbstractString, i::Integer) = i > ncodeunits(s) -eltype(::Type{<:AbstractString}) = Char -sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s)) -firstindex(s::AbstractString) = 1 -lastindex(s::AbstractString) = thisind(s, ncodeunits(s)) - -function getindex(s::AbstractString, i::Integer) - @boundscheck checkbounds(s, i) - @inbounds return isvalid(s, i) ? next(s, i)[1] : string_index_err(s, i) -end - -getindex(s::AbstractString, i::Colon) = s -# TODO: handle other ranges with stride ±1 specially? -# TODO: add more @propagate_inbounds annotations? -getindex(s::AbstractString, v::AbstractVector{<:Integer}) = - sprint(io->(for i in v; write(io, s[i]) end), sizehint=length(v)) -getindex(s::AbstractString, v::AbstractVector{Bool}) = - throw(ArgumentError("logical indexing not supported for strings")) - -function get(s::AbstractString, i::Integer, default) -# TODO: use ternary once @inbounds is expression-like - if checkbounds(Bool, s, i) - @inbounds return s[i] - else - return default - end -end - ## bounds checking ## checkbounds(::Type{Bool}, s::AbstractString, i::Integer) = @@ -379,6 +336,7 @@ julia> thisind("αβγdef", 10) julia> thisind("αβγdef", 20) 20 +``` """ thisind(s::AbstractString, i::Integer) = thisind(s, Int(i)) @@ -470,21 +428,6 @@ function nextind(s::AbstractString, i::Int, n::Int) return i + n end -## string index iteration type ## - -struct EachStringIndex{T<:AbstractString} - s::T -end -keys(s::AbstractString) = EachStringIndex(s) - -length(e::EachStringIndex) = length(e.s) -first(::EachStringIndex) = 1 -last(e::EachStringIndex) = 
lastindex(e.s) -start(e::EachStringIndex) = start(e.s) -next(e::EachStringIndex, state) = (state, nextind(e.s, state)) -done(e::EachStringIndex, state) = done(e.s, state) -eltype(::Type{<:EachStringIndex}) = Int - """ isascii(c::Union{Char,AbstractString}) -> Bool diff --git a/base/strings/iteration.jl b/base/strings/iteration.jl new file mode 100644 index 0000000000000..96d17d8ba18bc --- /dev/null +++ b/base/strings/iteration.jl @@ -0,0 +1,120 @@ +# A specialized iterator for EachIndex of strings +struct EachStringIndex{T<:AbstractString} + s::T +end +keys(s::AbstractString) = EachStringIndex(s) + +length(e::EachStringIndex) = length(e.s) +first(::EachStringIndex) = 1 +last(e::EachStringIndex) = lastindex(e.s) +eltype(::Type{<:EachStringIndex}) = Int + +# Iteration over StringNext +# +# Any new subtype of AbstractString, should override +# +# next(::StringNext{MyString}, state) +# +# to provide iteration over the string and its indices. All other iteration methods, +# including iteration over strings, iteration over pairs, indexing into string, +# iteration over indicies alone are derived from this method. 
+ +const StringNext{T<:AbstractString} = Iterators.Next{T, EachStringIndex{T}} +StringNext(x::T) where {T<:AbstractString} = Next(x) +StringNext(x::T, idx) where {T<:AbstractString} = Next(x, idx) +StringNext(x::T, idx, itr) where {T<:AbstractString} = Next(x, idx, itr) + +start(sp::StringNext) = 1 +function done(s::StringNext, i) + if isa(i, Integer) + return i > ncodeunits(s.data) + else + throw(MethodError(done, (s, i))) + end +end +function next(s::StringNext, i) + if isa(i, Integer) && !isa(i, Int) + return next(s, Int(i)) + else + throw(MethodError(next, (s, i))) + end +end + +# Derive iteration over pairs from `StringNext` +const StringPairs{T<:AbstractString} = Iterators.Pairs{Int, Char, EachStringIndex{T}, T} +StringPairs{T}(x::T) where {T<:AbstractString} = Iterators.Pairs(x, eachindex(x)) +StringPairs(x::T) where {T<:AbstractString} = StringPairs{T}(x) + +Iterators.pairs(s::AbstractString) = StringPairs(s) + +start(e::StringPairs) = (firstindex(e.data), start(StringNext(e.data))) +done(e::StringPairs, (idx, state)) = done(StringNext(e.data), state) +function next(s::StringPairs, (idx, state)) + ((c, nidx), state) = next(StringNext(s.data), state) + Pair(idx, c), (nidx, state) +end + +# Derive reverse pair iteration. +# N.B. 
String implementers may wish to override +# +# next(s::Iterators.Reverse{<:StringPairs}, idx) +# +# to provide efficient variable-length reverse decoding +Iterators.reverse(s::StringPairs) = Iterators.Reverse(s) + +start(e::Iterators.Reverse{<:StringPairs}) = ncodeunits(e.itr.data)+1 +done(e::Iterators.Reverse{<:StringPairs}, idx) = idx == firstindex(e.itr.data) +function next(s::Iterators.Reverse{<:StringPairs}, idx) + tidx = thisind(s.itr.data, idx-1) + (c, nidx) = first(Next(s.itr.data, tidx)) + Pair(tidx, c), tidx +end + +function prev(s::AbstractString, idx) + (i, c), _ = next(Iterators.Reverse(StringPairs(s)), idx) + (c, i) +end + + +# Derive iteration over strings from `StringNext` +start(s::AbstractString) = start(StringNext(s)) +done(s::AbstractString, state) = done(StringNext(s), state) +function next(s::AbstractString, state) + ((c, _), state) = next(StringNext(s), state) + (c, state) +end + +eltype(::Type{<:AbstractString}) = Char +sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s)) +firstindex(s::AbstractString) = 1 +lastindex(s::AbstractString) = thisind(s, ncodeunits(s)) + +function getindex(s::AbstractString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return isvalid(s, i) ? first(first(Next(s, i))) : string_index_err(s, i) +end + +getindex(s::AbstractString, i::Colon) = s +# TODO: handle other ranges with stride ±1 specially? +# TODO: add more @propagate_inbounds annotations? 
+getindex(s::AbstractString, v::AbstractVector{<:Integer}) = + sprint(io->(for i in v; write(io, s[i]) end), sizehint=length(v)) +getindex(s::AbstractString, v::AbstractVector{Bool}) = + throw(ArgumentError("logical indexing not supported for strings")) + +function get(s::AbstractString, i::Integer, default) +# TODO: use ternary once @inbounds is expression-like + if checkbounds(Bool, s, i) + @inbounds return s[i] + else + return default + end +end + +# Derive iteration over indices from `StringNext` +start(e::EachStringIndex) = start(StringPairs(e.s)) +done(e::EachStringIndex, state) = done(StringPairs(e.s), state) +function next(e::EachStringIndex, state) + ((idx, _), state) = next(StringPairs(e.s), state) + (idx, state) +end \ No newline at end of file diff --git a/base/strings/string.jl b/base/strings/string.jl index 487bfd8819d1d..8032533a5c2f3 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -167,10 +167,11 @@ is_valid_continuation(c) = c & 0xc0 == 0x80 ## required core functionality ## -@propagate_inbounds function next(s::String, i::Int) +@propagate_inbounds function next(sp::StringNext{String}, i::Int) + s = sp.data b = codeunit(s, i) u = UInt32(b) << 24 - between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1 + between(b, 0x80, 0xf7) || return ((reinterpret(Char, u), i + 1), i+1) return next_continued(s, i, u) end @@ -193,7 +194,7 @@ function next_continued(s::String, i::Int, u::UInt32) b & 0xc0 == 0x80 || @goto ret u |= UInt32(b); i += 1 @label ret - return reinterpret(Char, u), i + return (reinterpret(Char, u), i), i end @propagate_inbounds function getindex(s::String, i::Int) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 879955daeba48..5845c67901872 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -62,10 +62,11 @@ function codeunit(s::SubString, i::Integer) @inbounds return codeunit(s.string, s.offset + i) end -function next(s::SubString, i::Integer) +function 
next(sp::StringNext{<:SubString}, i::Int) + s = sp.data @boundscheck checkbounds(s, i) - @inbounds c, i = next(s.string, s.offset + i) - return c, i - s.offset + @inbounds (c, idx), i = next(StringNext(s.string), s.offset + i) + return (c, idx - s.offset), i - s.offset end function getindex(s::SubString, i::Integer) diff --git a/base/strings/util.jl b/base/strings/util.jl index 223510155e018..898cc2b0dfbbd 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -136,9 +136,9 @@ julia> lstrip(a) function lstrip(s::AbstractString, chars::Chars=_default_delims) e = lastindex(s) for (i, c) in pairs(s) - !(c in chars) && return SubString(s, i, e) + !(c in chars) && return @inbounds SubString(s, i, e) end - SubString(s, e+1, e) + @inbounds SubString(s, e+1, e) end """ @@ -161,9 +161,9 @@ julia> rstrip(a) """ function rstrip(s::AbstractString, chars::Chars=_default_delims) for (i, c) in Iterators.reverse(pairs(s)) - c in chars || return SubString(s, 1, i) + c in chars || return @inbounds SubString(s, 1, i) end - SubString(s, 1, 0) + @inbounds SubString(s, 1, 0) end """ diff --git a/base/sysimg.jl b/base/sysimg.jl index 8750bfb0dd366..0a380423fd18b 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -176,7 +176,7 @@ Array{T}(::Missing, d...) 
where {T} = fill!(Array{T}(uninitialized, d...), missi include("abstractdict.jl") include("iterators.jl") -using .Iterators: zip, enumerate +using .Iterators: zip, enumerate, Next using .Iterators: Flatten, product # for generators include("namedtuple.jl") diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index f636d93e6e261..39478502418db 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -1427,7 +1427,9 @@ Base.ncodeunits(s::GenericString) = ncodeunits(s.string) Base.codeunit(s::GenericString) = codeunit(s.string) Base.codeunit(s::GenericString, i::Integer) = codeunit(s.string, i) Base.isvalid(s::GenericString, i::Integer) = isvalid(s.string, i) -Base.next(s::GenericString, i::Integer) = next(s.string, i) +Base.start(s::Base.StringNext{GenericString}) = start(Base.StringNext(s.data.string)) +Base.done(s::Base.StringNext{GenericString}, state) = done(Base.StringNext(s.data.string), state) +Base.next(s::Base.StringNext{GenericString}, state) = next(Base.StringNext(s.data.string), state) Base.reverse(s::GenericString) = GenericString(reverse(s.string)) Base.reverse(s::SubString{GenericString}) = GenericString(typeof(s.string)(reverse(String(s)))) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 92273526fa442..990c4b6a0d3d6 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -238,7 +238,6 @@ end @test first(eachindex("foobar")) === 1 @test first(eachindex("")) === 1 @test last(eachindex("foobar")) === lastindex("foobar") - @test done(eachindex("foobar"),7) @test Int == eltype(Base.EachStringIndex) == eltype(Base.EachStringIndex{String}) == eltype(Base.EachStringIndex{GenericString}) == @@ -480,17 +479,11 @@ end @test nextind(s, lastindex(s)) > sizeof(s) end end + # Test cmp with AbstractStrings that don't index the same as UTF-8, which would include # (LegacyString.)UTF16String and (LegacyString.)UTF32String, among others. 
+include("teststringtypes.jl") -mutable struct CharStr <: AbstractString - chars::Vector{Char} - CharStr(x) = new(collect(x)) -end -Base.start(x::CharStr) = start(x.chars) -Base.next(x::CharStr, i::Int) = next(x.chars, i) -Base.done(x::CharStr, i::Int) = done(x.chars, i) -Base.lastindex(x::CharStr) = lastindex(x.chars) @testset "cmp without UTF-8 indexing" begin # Simple case, with just ANSI Latin 1 characters @test "áB" != CharStr("áá") # returns false with bug @@ -866,3 +859,22 @@ let x = SubString("ab", 1, 1) @test y === x chop("ab") === chop.(["ab"])[1] end + +@testset "Generic String APIs" begin + cs = CharString([' ', 'a', 'b', 'c', ' ']) + @test cs == " abc " + @test lstrip(cs) == SubString(cs, 2) == "abc " + @test rstrip(cs) == " abc" + rs = RopeString([" ", "abc", " "]) + @test rs == cs + @test lstrip(rs) == lstrip(cs) == "abc " + @test rstrip(rs) == rstrip(cs) == " abc" + ds = DecodeString(raw" \x61b\u0063 ") + @test ds == cs + @test lstrip(ds) == lstrip(cs) == "abc " + @test rstrip(ds) == rstrip(cs) == " abc" + ds2 = DecodeString(RopeString([" ", raw"\x61b\u00", "63 "])) + @test ds2 == cs + @test lstrip(ds2) == lstrip(cs) == "abc " + @test rstrip(ds2) == rstrip(cs) == " abc" +end diff --git a/test/strings/teststringtypes.jl b/test/strings/teststringtypes.jl new file mode 100644 index 0000000000000..61edc0e12c10a --- /dev/null +++ b/test/strings/teststringtypes.jl @@ -0,0 +1,174 @@ +# A simple abstract string with fixed indicies +struct CharString <: AbstractString + chars::Vector{Char} +end +CharString(s::AbstractString) = CharString(Vector{Char}(s)) + +Base.ncodeunits(c::CharString) = length(c.chars) +Base.thisind(c::CharString, i::Int) = i +Base.isvalid(c::CharString, i::Int) = 1 <= i <= ncodeunits(c) +function Base.next(sp::Base.StringNext{CharString}, i::Int) + (sp.data.chars[i], i+1), i+1 +end +const CharStr = CharString + +# An abstract string with efficient non-linear indicies +struct RopeString <: AbstractString + strs::Vector{String} +end + 
+function lin2trip(rs::RopeString, idx) + linear = idx + element = 1 + while ncodeunits(rs.strs[element]) < idx + idx -= ncodeunits(rs.strs[element]) + element += 1 + end + (linear, element, idx) +end + +function Base.Next(rs::RopeString, idx::Int, itr::Base.EachStringIndex{RopeString}) + Base.Iterators.Rest(Base.StringNext(rs), lin2trip(rs, idx)) +end +Base.ncodeunits(rs::RopeString) = sum(ncodeunits, rs.strs) +function Base.isvalid(rs::RopeString, idx::Int) + (lin, el, eidx) = lin2trip(rs, idx) + Base.isvalid(rs.strs[el], eidx) +end +function Base.thisind(rs::RopeString, idx::Int) + (lin, el, eidx) = lin2trip(rs, idx) + lin - (eidx - Base.thisind(rs.strs[el], eidx)) +end +function Base.start(rs::Base.StringNext{RopeString}) + element = 1 + local ni + while element <= length(rs.data.strs) + el = rs.data.strs[element] + ni = start(el) + done(el, ni) || break + element += 1 + end + (1, element, ni) +end +Base.done(rs::Base.StringNext{RopeString}, (linear, element, ni)::Tuple{Int, Int, Int}) = element > length(rs.data.strs) +function Base.next(rs::Base.StringNext{RopeString}, (linear, element, i)::Tuple{Int, Int, Int}) + el = rs.data.strs[element] + (c, nexti), _ = next(Base.StringNext(el), i) + linear += nexti - i + while done(el, nexti) + element += 1 + element <= length(rs.data.strs) || break + el = rs.data.strs[element] + nexti = start(el) + end + (c, linear), (linear, element, nexti) +end +Base.start(rs::Base.StringNext{SubString{RopeString}}) = lin2trip(rs.data.string, rs.data.offset+firstindex(rs.data.string)) +function Base.done(rs::Base.StringNext{SubString{RopeString}}, (linear, element, ni)::Tuple) + (linear - rs.data.offset) > rs.data.ncodeunits && return true + done(Base.StringNext(rs.data.string), (linear, element, ni)) +end +function Base.next(rs::Base.StringNext{SubString{RopeString}}, state::Tuple) + (c, linear), state = next(Base.StringNext(rs.data.string), state) + (c, linear - rs.data.offset), state +end + +# DecodeString +struct 
DecodeString{S<:AbstractString} <: AbstractString + s::S +end +function Base.Next(s::T, idx, itr::Base.EachStringIndex{T}) where {T<:DecodeString} + n = Base.Next(s.s, idx, eachindex(s.s)) + (n isa Base.Next) && return Base.Next(s) + (n isa Base.Iterators.Rest) && return Base.Iterators.Rest(Base.Next(s), n.st) + error("Result of `Next` must be `Next` or `Rest`, not $(typeof(n))") +end +Base.ncodeunits(s::DecodeString) = ncodeunits(s.s) +Base.start(sp::Base.StringNext{<:DecodeString}) = start(Base.StringNext(sp.data.s)) +Base.done(sp::Base.StringNext{<:DecodeString}, state::Int) = done(Base.StringNext(sp.data.s), state::Int) +Base.done(sp::Base.StringNext{<:DecodeString}, state) = done(Base.StringNext(sp.data.s), state) +Base.SubString(d::DecodeString, start::Int, ncodeunits::Int) = DecodeString(SubString(d.s, start, ncodeunits)) +function Base.next(sp::Base.StringNext{<:DecodeString}, state) + sps = Base.StringNext(sp.data.s) + (c, i), state = next(sps, state) + if c != '\\' || done(sps, state) + return (c, i), state + end + (c, i), state = next(sps, state) + if c == 'x' || c == 'u' || c == 'U' + n = k = 0 + m = c == 'x' ? 2 : + c == 'u' ? 4 : 8 + while (k += 1) <= m && !done(sps, state) + (c′, i′), state′ = next(sps, state) + n = '0' <= c′ <= '9' ? n<<4 + (c′-'0') : + 'a' <= c′ <= 'f' ? n<<4 + (c′-'a'+10) : + 'A' <= c′ <= 'F' ? n<<4 + (c′-'A'+10) : break + (i, state) = (i′, state′) + end + if k == 1 || n > 0x10ffff + u = m == 4 ? 'u' : 'U' + throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" : + "unicode (\\$u)") escape sequence")) + end + return (Char(n), i), state + elseif '0' <= c <= '7' + k = 1 + n = c-'0' + while (k += 1) <= 3 && !done(sps, state) + (c′, i′), state′ = next(sps, state) + n = ('0' <= c <= '7') ? n<<3 + c-'0' : break + (i, state) = (i′, state′) + end + if n > 255 + throw(ArgumentError("octal escape sequence out of range")) + end + return (Char(n), i), state + else + return ( + c == 'a' ? '\a' : + c == 'b' ? '\b' : + c == 't' ? 
'\t' : + c == 'n' ? '\n' : + c == 'v' ? '\v' : + c == 'f' ? '\f' : + c == 'r' ? '\r' : + c == 'e' ? '\e' : c, i), state + end +end +is_octal_char(c) = c in '0':'7' +is_hex_char(c) = (c in '0':'9') || (c in 'A':'F') || (c in 'a':'f') +function Base.thisind(s::DecodeString, ind::Int) + si = thisind(s.s, ind) + c = s.s[si] + if c == '\\' + return si + end + could_be_single_char = c in ('a', 'b', 'c', 'n', 'v', 'f', 'r', 'e') + could_be_unicode = is_hex_char(c) + could_be_octal = is_octal_char(c) + nchars = 1 + pi = si + while pi >= 1 && (could_be_single_char || could_be_unicode || could_be_octal) + nchars += 1 + (c, pi) = Base.prev(s.s, pi) + if c == '\\' + return (could_be_single_char || could_be_octal) ? pi : si + elseif c in ('u', 'U', 'x') + (c, pi) = Base.prev(s.s, pi) + m = c == 'U' ? 8 : + c == 'u' ? 4 : + 2 + if c == '\\' + return nchars < m ? pi : si + end + return si + else + could_be_octal |= nchars <= 3 && is_octal_char(c) + could_be_unicode |= nchars <= 8 && is_hex_char(c) + end + could_be_single_char = false + end + return si +end +Base.isvalid(s::DecodeString, ind::Int) = thisind(s, ind) == ind