Skip to content

Commit

Permalink
graphemes(s, m:n) substring slicing (#44266)
Browse files Browse the repository at this point in the history
* graphemes(s, m:n) substring slicing

* variable naming

* whoops

* consolidate tests

* empty-range test

* note complexity

* Update stdlib/Unicode/src/Unicode.jl

Co-authored-by: Sebastian Stock <42280794+sostock@users.noreply.github.com>

* news fix

Co-authored-by: Sebastian Stock <42280794+sostock@users.noreply.github.com>
  • Loading branch information
stevengj and sostock authored Apr 2, 2022
1 parent daa2101 commit 41c2c7c
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 0 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ Standard library changes

#### UUIDs

#### Unicode

* `graphemes(s, m:n)` returns a substring of the `m`-th to `n`-th graphemes in `s` ([#44266]).

#### Mmap

#### DelimitedFiles
Expand Down
63 changes: 63 additions & 0 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,69 @@ letter combined with an accent mark is a single grapheme.)
"""
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)

"""
graphemes(s::AbstractString, m:n) -> SubString
Returns a [`SubString`](@ref) of `s` consisting of the `m`-th
through `n`-th graphemes of the string `s`, where the second
argument `m:n` is an integer-valued [`AbstractUnitRange`](@ref).
Loosely speaking, this corresponds to the `m:n`-th user-perceived
"characters" in the string. For example:
```jldoctest
julia> s = graphemes("exposé", 3:6)
"posé"
julia> collect(s)
5-element Vector{Char}:
'p': ASCII/Unicode U+0070 (category Ll: Letter, lowercase)
'o': ASCII/Unicode U+006F (category Ll: Letter, lowercase)
's': ASCII/Unicode U+0073 (category Ll: Letter, lowercase)
'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)
'́': Unicode U+0301 (category Mn: Mark, nonspacing)
```
This consists of the 3rd to *7th* codepoints ([`Char`](@ref)s) in `"exposé"`,
because the grapheme `"é"` is actually *two* Unicode codepoints
(an `'e'` followed by an acute-accent combining character U+0301).
Because finding grapheme boundaries requires iteration over the
string contents, the `graphemes(s, m:n)` function requires time
proportional to the length of the string (number of codepoints)
before the end of the substring.
!!! compat "Julia 1.9"
The `m:n` argument of `graphemes` requires Julia 1.9.
"""
function graphemes(s::AbstractString, r::AbstractUnitRange{<:Integer})
m, n = Int(first(r)), Int(last(r))
m > 0 || throw(ArgumentError("starting index $m is not ≥ 1"))
n < m && return @view s[1:0]
c0 = eltype(s)(0x00000000)
state = Ref{Int32}(0)
count = 0
i, iprev, ilast = 1, 1, lastindex(s)
# find the start of the m-th grapheme
while i ilast && count < m
@inbounds c = s[i]
count += Base.Unicode.isgraphemebreak!(state, c0, c)
c0 = c
i, iprev = nextind(s, i), i
end
start = iprev
count < m && throw(BoundsError(s, i))
# find the end of the n-th grapheme
while i ilast
@inbounds c = s[i]
count += Base.Unicode.isgraphemebreak!(state, c0, c)
count > n && break
c0 = c
i, iprev = nextind(s, i), i
end
count < n && throw(BoundsError(s, i))
return @view s[start:iprev]
end

using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK

function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
Expand Down
10 changes: 10 additions & 0 deletions stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,16 @@ end

@test Base.Unicode.isgraphemebreak('α', 'β')
@test !Base.Unicode.isgraphemebreak('α', '\u0302')

for pre in ("",""), post in ("","")
prelen = length(graphemes(pre))
@test graphemes(pre * "öü" * post, (1:2) .+ prelen) == "öü"
@test graphemes(pre * "" * post, (1:1) .+ prelen) == ""
end
@test graphemes("äöüx", 6:5)::SubString{String} == ""
@test_throws BoundsError graphemes("äöüx", 2:5)
@test_throws BoundsError graphemes("äöüx", 5:5)
@test_throws ArgumentError graphemes("äöüx", 0:1)
end

@testset "#3721, #6939 up-to-date character widths" begin
Expand Down

0 comments on commit 41c2c7c

Please sign in to comment.