From 41c2c7c5924f9c577983fdbd002b0bbfeb349601 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Sat, 2 Apr 2022 17:27:09 -0400
Subject: [PATCH] graphemes(s, m:n) substring slicing (#44266)

* graphemes(s, m:n) substring slicing

* variable naming

* whoops

* consolidate tests

* empty-range test

* note complexity

* Update stdlib/Unicode/src/Unicode.jl

Co-authored-by: Sebastian Stock <42280794+sostock@users.noreply.github.com>

* news fix

Co-authored-by: Sebastian Stock <42280794+sostock@users.noreply.github.com>
---
 NEWS.md                         |  4 +++
 stdlib/Unicode/src/Unicode.jl   | 63 +++++++++++++++++++++++++++++++++
 stdlib/Unicode/test/runtests.jl | 10 ++++++
 3 files changed, 77 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 6e0b00c92f041..60eb4a5ed06b4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -90,6 +90,10 @@ Standard library changes
 
 #### UUIDs
 
+#### Unicode
+
+* `graphemes(s, m:n)` returns a substring of the `m`-th to `n`-th graphemes in `s` ([#44266]).
+
 #### Mmap
 
 #### DelimitedFiles
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
index e31f7ee1e27f2..0467a8d50aa6b 100644
--- a/stdlib/Unicode/src/Unicode.jl
+++ b/stdlib/Unicode/src/Unicode.jl
@@ -143,6 +143,69 @@ letter combined with an accent mark is a single grapheme.)
 """
 graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
 
+"""
+    graphemes(s::AbstractString, m:n) -> SubString
+
+Returns a [`SubString`](@ref) of `s` consisting of the `m`-th
+through `n`-th graphemes of the string `s`, where the second
+argument `m:n` is an integer-valued [`AbstractUnitRange`](@ref).
+
+Loosely speaking, this corresponds to the `m:n`-th user-perceived
+"characters" in the string.  For example:
+
+```jldoctest
+julia> s = graphemes("exposé", 3:6)
+"posé"
+
+julia> collect(s)
+5-element Vector{Char}:
+ 'p': ASCII/Unicode U+0070 (category Ll: Letter, lowercase)
+ 'o': ASCII/Unicode U+006F (category Ll: Letter, lowercase)
+ 's': ASCII/Unicode U+0073 (category Ll: Letter, lowercase)
+ 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)
+ '́': Unicode U+0301 (category Mn: Mark, nonspacing)
+```
+This consists of the 3rd to *7th* codepoints ([`Char`](@ref)s) in `"exposé"`,
+because the grapheme `"é"` is actually *two* Unicode codepoints
+(an `'e'` followed by an acute-accent combining character U+0301).
+
+Because finding grapheme boundaries requires iteration over the
+string contents, the `graphemes(s, m:n)` function requires time
+proportional to the number of codepoints in the string that
+precede the end of the substring.
+
+!!! compat "Julia 1.9"
+    The `m:n` argument of `graphemes` requires Julia 1.9.
+"""
+function graphemes(s::AbstractString, r::AbstractUnitRange{<:Integer})
+    m, n = Int(first(r)), Int(last(r))  # first and last requested grapheme numbers
+    m > 0 || throw(ArgumentError("starting index $m is not ≥ 1"))
+    n < m && return @view s[1:0]  # empty range: return an empty view without scanning s
+    c0 = eltype(s)(0x00000000)  # "previous" character; codepoint 0 until something is read
+    state = Ref{Int32}(0)  # mutable state handed to every isgraphemebreak! call below
+    count = 0  # number of grapheme breaks counted so far
+    i, iprev, ilast = 1, 1, lastindex(s)  # next index to read, last index read, final index
+    # find the start of the m-th grapheme: read characters until m breaks have been counted
+    while i ≤ ilast && count < m
+        @inbounds c = s[i]
+        count += Base.Unicode.isgraphemebreak!(state, c0, c)  # count a break between c0 and c
+        c0 = c
+        i, iprev = nextind(s, i), i
+    end
+    start = iprev  # index of the character whose break brought count up to m
+    count < m && throw(BoundsError(s, i))  # s ended before its m-th grapheme began
+    # find the end of the n-th grapheme: stop at the break that would begin grapheme n+1
+    while i ≤ ilast
+        @inbounds c = s[i]
+        count += Base.Unicode.isgraphemebreak!(state, c0, c)
+        count > n && break  # c is past grapheme n; iprev still points at the last index kept
+        c0 = c
+        i, iprev = nextind(s, i), i
+    end
+    count < n && throw(BoundsError(s, i))  # s ended before reaching its n-th grapheme
+    return @view s[start:iprev]
+end
+
 using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
 
 function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
index a4faac2bd3ba9..1d1b78e02bf27 100644
--- a/stdlib/Unicode/test/runtests.jl
+++ b/stdlib/Unicode/test/runtests.jl
@@ -271,6 +271,16 @@ end
 
     @test Base.Unicode.isgraphemebreak('α', 'β')
     @test !Base.Unicode.isgraphemebreak('α', '\u0302')
+
+    for pre in ("","ä"), post in ("","x̂")
+        prelen = length(graphemes(pre))
+        @test graphemes(pre * "öü" * post, (1:2) .+ prelen) == "öü"
+        @test graphemes(pre * "ö" * post, (1:1) .+ prelen) == "ö"
+    end
+    @test graphemes("äöüx", 6:5)::SubString{String} == ""
+    @test_throws BoundsError graphemes("äöüx", 2:5)
+    @test_throws BoundsError graphemes("äöüx", 5:5)
+    @test_throws ArgumentError graphemes("äöüx", 0:1)
 end
 
 @testset "#3721, #6939 up-to-date character widths" begin