diff --git a/NEWS.md b/NEWS.md index df05b8f50b189..c1d18bb7c14ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,7 @@ New library functions * `findfirst`, `findlast`, `findnext` and `findprev` now accept a character as first argument to search for that character in a string passed as the second argument ([#31664]). * New `findall(pattern, string)` method where `pattern` is a string or regex ([#31834]). +* `count(pattern, string)` gives the number of things `findall` would match ([#32849]). * `istaskfailed` is now documented and exported, like its siblings `istaskdone` and `istaskstarted` ([#32300]). * `RefArray` and `RefValue` objects now accept index `CartesianIndex()` in `getindex` and `setindex!` ([#32653]) diff --git a/base/regex.jl b/base/regex.jl index d28c619e959b5..dd343ab4d86e8 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -328,26 +328,58 @@ findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s)) """ - findall(pattern::Union{AbstractString,Regex}, string::AbstractString; overlap::Bool=false) + findall( + pattern::Union{AbstractString,Regex}, + string::AbstractString; + overlap::Bool = false, + ) Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`. Each element of the returned vector is a range of indices where the matching sequence is found, like the return value of [`findnext`](@ref). If `overlap=true`, the matching sequences are allowed to overlap indices in the -original string, otherwise they must be from distinct character ranges. +original string, otherwise they must be from disjoint character ranges. """ function findall(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false) found = UnitRange{Int}[] i, e = firstindex(s), lastindex(s) while true r = findnext(t, s, i) - isnothing(r) && return found + isnothing(r) && break push!(found, r) j = overlap || isempty(r) ? 
first(r) : last(r) - j > e && return found + j > e && break + @inbounds i = nextind(s, j) + end + return found +end + +""" + count( + pattern::Union{AbstractString,Regex}, + string::AbstractString; + overlap::Bool = false, + ) + +Return the number of matches for `pattern` in `string`. This is equivalent to +calling `length(findall(pattern, string))` but more efficient. + +If `overlap=true`, the matching sequences are allowed to overlap indices in the +original string, otherwise they must be from disjoint character ranges. +""" +function count(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false) + n = 0 + i, e = firstindex(s), lastindex(s) + while true + r = findnext(t, s, i) + isnothing(r) && break + n += 1 + j = overlap || isempty(r) ? first(r) : last(r) + j > e && break @inbounds i = nextind(s, j) end + return n end """ diff --git a/test/regex.jl b/test/regex.jl index aad5413fd250b..287647bce9d56 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -46,12 +46,18 @@ @test_throws ArgumentError match(r"test", GenericString("this is a test")) @test_throws ArgumentError findfirst(r"test", GenericString("this is a test")) - # findall: + # findall @test findall(r"\w+", "foo bar") == [1:3, 5:7] @test findall(r"\w+", "foo bar", overlap=true) == [1:3, 2:3, 3:3, 5:7, 6:7, 7:7] @test findall(r"\w*", "foo bar") == [1:3, 4:3, 5:7, 8:7] @test findall(r"\b", "foo bar") == [1:0, 4:3, 5:4, 8:7] + # count + @test count(r"\w+", "foo bar") == 2 + @test count(r"\w+", "foo bar", overlap=true) == 6 + @test count(r"\w*", "foo bar") == 4 + @test count(r"\b", "foo bar") == 4 + # Named subpatterns let m = match(r"(?<a>.)(.)(?<b>.)", "xyz") @test (m[:a], m[2], m["b"]) == ("x", "y", "z")