From 12a6906f1e11d0f69c2f97f82dbfd23e650b3dbb Mon Sep 17 00:00:00 2001 From: ScottPJones <scottjones@alum.mit.edu> Date: Sun, 18 Oct 2020 00:54:36 -0400 Subject: [PATCH] Add AbstractPattern and AbstractMatch to allow for more general pattern matching --- base/broadcast.jl | 2 +- base/exports.jl | 2 ++ base/regex.jl | 28 +++++++++++++++++++++------- base/strings/search.jl | 6 +++--- base/strings/util.jl | 2 +- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index b55051d82546d1..9c4533b77414a7 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -675,7 +675,7 @@ julia> Broadcast.broadcastable("hello") # Strings break convention of matching i Base.RefValue{String}("hello") ``` """ -broadcastable(x::Union{Symbol,AbstractString,Function,UndefInitializer,Nothing,RoundingMode,Missing,Val,Ptr,Regex,Pair}) = Ref(x) +broadcastable(x::Union{Symbol,AbstractString,Function,UndefInitializer,Nothing,RoundingMode,Missing,Val,Ptr,AbstractPattern,Pair}) = Ref(x) broadcastable(::Type{T}) where {T} = Ref{Type{T}}(T) broadcastable(x::Union{AbstractArray,Number,Ref,Tuple,Broadcasted}) = x # Default to collecting iterables — which will error for non-iterables diff --git a/base/exports.jl b/base/exports.jl index 2c0c628eec866b..287866ca59503b 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -22,6 +22,8 @@ export AbstractVector, AbstractVecOrMat, Array, + AbstractMatch, + AbstractPattern, AbstractDict, BigFloat, BigInt, diff --git a/base/regex.jl b/base/regex.jl index 75c3777fd681a0..68b8acf3c3cecb 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -7,6 +7,13 @@ include("pcre.jl") const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.NO_UTF_CHECK | PCRE.ALT_BSUX | PCRE.UCP const DEFAULT_MATCH_OPTS = PCRE.NO_UTF_CHECK +""" + An abstract type representing any sort of pattern matching expression (typically a regular + expression). + `AbstractPattern` objects can be used to match strings with [`match`](@ref). 
+""" +abstract type AbstractPattern end + """ Regex(pattern[, flags]) @@ -17,7 +24,7 @@ with [`match`](@ref). `Regex(pattern[, flags])` constructor is usually used if the `pattern` string needs to be interpolated. See the documentation of the string macro for details on flags. """ -mutable struct Regex +mutable struct Regex <: AbstractPattern pattern::String compile_options::UInt32 match_options::UInt32 @@ -128,10 +135,16 @@ function show(io::IO, re::Regex) end end +""" + `AbstractMatch` objects are used to represent information about matches found in a string + using an `AbstractPattern`. +""" +abstract type AbstractMatch end + # TODO: map offsets into strings in other encodings back to original indices. # or maybe it's better to just fail since that would be quite slow -struct RegexMatch +struct RegexMatch <: AbstractMatch match::SubString{String} captures::Vector{Union{Nothing,SubString{String}}} offset::Int @@ -278,7 +291,8 @@ true """ function match end -function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, add_opts::UInt32=UInt32(0)) +function match(re::Regex, str::Union{SubString{String}, String}, idx::Integer, + add_opts::UInt32=UInt32(0)) compile(re) opts = re.match_options | add_opts matched, data = PCRE.exec_r_data(re.regex, str, idx-1, opts) @@ -336,7 +350,7 @@ findfirst(r::Regex, s::AbstractString) = findnext(r,s,firstindex(s)) """ findall( - pattern::Union{AbstractString,Regex}, + pattern::Union{AbstractString,AbstractPattern}, string::AbstractString; overlap::Bool = false, ) @@ -365,7 +379,7 @@ julia> findall("a", "banana") 6:6 ``` """ -function findall(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false) +function findall(t::Union{AbstractString,AbstractPattern}, s::AbstractString; overlap::Bool=false) found = UnitRange{Int}[] i, e = firstindex(s), lastindex(s) while true @@ -381,7 +395,7 @@ end """ count( - pattern::Union{AbstractString,Regex}, + pattern::Union{AbstractString,AbstractPattern}, 
string::AbstractString; overlap::Bool = false, ) @@ -392,7 +406,7 @@ calling `length(findall(pattern, string))` but more efficient. If `overlap=true`, the matching sequences are allowed to overlap indices in the original string, otherwise they must be from disjoint character ranges. """ -function count(t::Union{AbstractString,Regex}, s::AbstractString; overlap::Bool=false) +function count(t::Union{AbstractString,AbstractPattern}, s::AbstractString; overlap::Bool=false) n = 0 i, e = firstindex(s), lastindex(s) while true diff --git a/base/strings/search.jl b/base/strings/search.jl index b1908ac99c8600..140a5eab06350b 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -88,7 +88,7 @@ end """ findfirst(pattern::AbstractString, string::AbstractString) - findfirst(pattern::Regex, string::String) + findfirst(pattern::AbstractPattern, string::String) Find the first occurrence of `pattern` in `string`. Equivalent to [`findnext(pattern, string, firstindex(s))`](@ref). @@ -250,7 +250,7 @@ end """ findnext(pattern::AbstractString, string::AbstractString, start::Integer) - findnext(pattern::Regex, string::String, start::Integer) + findnext(pattern::AbstractPattern, string::String, start::Integer) Find the next occurrence of `pattern` in `string` starting at position `start`. `pattern` can be either a string, or a regular expression, in which case `string` @@ -507,7 +507,7 @@ findprev(ch::AbstractChar, string::AbstractString, ind::Integer) = findprev(==(ch), string, ind) """ - occursin(needle::Union{AbstractString,Regex,AbstractChar}, haystack::AbstractString) + occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString) Determine whether the first argument is a substring of the second. If `needle` is a regular expression, checks whether `haystack` contains a match. 
diff --git a/base/strings/util.jl b/base/strings/util.jl index c45a353f07c593..15da308c36460b 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -546,7 +546,7 @@ If `count` is provided, replace at most `count` occurrences. `pat` may be a single character, a vector or a set of characters, a string, or a regular expression. If `r` is a function, each occurrence is replaced with `r(s)` -where `s` is the matched substring (when `pat` is a `Regex` or `AbstractString`) or +where `s` is the matched substring (when `pat` is an `AbstractPattern` or `AbstractString`) or character (when `pat` is an `AbstractChar` or a collection of `AbstractChar`). If `pat` is a regular expression and `r` is a [`SubstitutionString`](@ref), then capture group references in `r` are replaced with the corresponding matched text.