Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restrict wrapped types to reduce invalidations #310

Merged
merged 3 commits into from
Nov 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 45 additions & 29 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,41 @@ function reftype(sz::Int)
end
end

# Validate an element type up front so that callers get a descriptive
# ArgumentError rather than the TypeError that the restrictions in the
# type signatures would otherwise produce.
# `T` is the type being checked; `U` is only interpolated into the generic
# error message. Returns `true` when `T` is supported.
function check_supported_eltype(::Type{T}, ::Type{U}) where {T, U}
    # Symbol gets a dedicated message pointing users to strings
    if T === Symbol
        throw(ArgumentError("CategoricalArray no longer supports Symbol as element type "*
                            "as that forces recompiling too many Julia Base methods: " *
                            "use strings instead, e.g. via categorical(string.(x))"))
    end
    if !(T <: Union{SupportedTypes, Missing})
        throw(ArgumentError("CategoricalArray only supports " *
                            "AbstractString, AbstractChar and Number element types " *
                            "(got element type $U)"))
    end
    return true
end

# Map string types that should not be used as-is to `String`:
# any `SubString` type and the abstract `AbstractString` become `String`,
# every other type is returned unchanged.
function fixstringtype(T::Type)
    if T <: SubString || T === AbstractString
        return String
    end
    return T
end
# Unions are processed member by member
function fixstringtype(T::Union)
    left = fixstringtype(T.a)
    right = fixstringtype(T.b)
    return Union{left, right}
end
# `Union{} <: SubString` holds, so the bottom type needs its own method
# to be returned unchanged
fixstringtype(::Type{Union{}}) = Union{}

# Find a narrow type that is supported to hold all elements if possible.
# When the declared element type `T` is already supported, only the string
# type fix-up is applied. Otherwise the type is computed from the actual
# elements and validated, throwing an ArgumentError if it is unsupported.
function fixtype(A::AbstractArray{T}) where T
    # Fast path: no need to inspect the elements
    T <: Union{SupportedTypes, Missing} && return fixstringtype(T)

    # NOTE(review): no `init` is passed to the reduction, so an empty `A`
    # presumably errors here rather than in `check_supported_eltype` -- confirm
    joined = mapreduce(typeof, Base.promote_typejoin, A)
    U = fixstringtype(joined)
    check_supported_eltype(U, T)
    return U
end

"""
CategoricalArray{T}(undef, dims::Dims; levels=nothing, ordered=false)
CategoricalArray{T}(undef, dims::Int...; levels=nothing, ordered=false)

Construct an uninitialized `CategoricalArray` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalArray` with levels of type
`T <: $SupportedTypes` and dimensions `dims`.

The `levels` keyword argument can be a vector specifying possible values for the data
(this is equivalent to but more efficient than calling [`levels!`](@ref)
on the resulting array).
Expand All @@ -52,8 +78,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).

CategoricalArray(A::CategoricalArray; levels=nothing, ordered=false)

If `A` is already a `CategoricalArray`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -62,7 +86,8 @@ function CategoricalArray end
"""
CategoricalVector{T}(undef, m::Int; levels=nothing, ordered=false)

Construct an uninitialized `CategoricalVector` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalVector` with levels of type
`T <: $SupportedTypes` and length `m`.

The `levels` keyword argument can be a vector specifying possible values for the data
(this is equivalent to but more efficient than calling [`levels!`](@ref)
Expand All @@ -87,8 +112,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).

CategoricalVector(A::CategoricalVector; levels=nothing, ordered=false)

If `A` is already a `CategoricalVector`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -97,7 +120,8 @@ function CategoricalVector end
"""
CategoricalMatrix{T}(undef, m::Int, n::Int; levels=nothing, ordered=false)

Construct an uninitialized `CategoricalMatrix` with levels of type `T` and dimensions `dim`.
Construct an uninitialized `CategoricalMatrix` with levels of type
`T <: $SupportedTypes` and dimensions `(m, n)`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).

Expand All @@ -118,8 +142,6 @@ in ascending order; else, they are kept in their order of appearance in `A`.
The `ordered` keyword argument determines whether the array values can be compared
according to the ordering of levels or not (see [`isordered`](@ref)).

CategoricalMatrix(A::CategoricalMatrix; levels=nothing, ordered=isordered(A))

If `A` is already a `CategoricalMatrix`, its levels, orderedness and reference type
are preserved unless explicitly overridden.
"""
Expand All @@ -137,6 +159,7 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int};
ordered::Bool=false) where {T, N, R}
U = leveltype(nonmissingtype(T))
S = T >: Missing ? Union{U, Missing} : U
check_supported_eltype(S, T)
V = CategoricalValue{U, R}
levs = levels === nothing ? U[] : collect(U, levels)
CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered))
Expand Down Expand Up @@ -231,6 +254,7 @@ function CategoricalArray{T, N, R}(A::AbstractArray;
end

# From AbstractArray

CategoricalArray{T, N}(A::AbstractArray{S, N};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {S, T, N} =
Expand All @@ -242,17 +266,17 @@ CategoricalArray{T}(A::AbstractArray{S, N};
CategoricalArray(A::AbstractArray{T, N};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T, N} =
CategoricalArray{fixstringtype(T), N}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), N}(A, levels=levels, ordered=ordered)

CategoricalVector(A::AbstractVector{T};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T} =
CategoricalArray{fixstringtype(T), 1}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), 1}(A, levels=levels, ordered=ordered)

CategoricalMatrix(A::AbstractMatrix{T};
levels::Union{AbstractVector, Nothing}=nothing,
ordered::Bool=_isordered(A)) where {T} =
CategoricalArray{fixstringtype(T), 2}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), 2}(A, levels=levels, ordered=ordered)

# From CategoricalArray (preserve R)
CategoricalArray{T, N}(A::CategoricalArray{S, N, R};
Expand Down Expand Up @@ -286,12 +310,12 @@ convert(::Type{CategoricalArray{T, N}}, A::AbstractArray{S, N}) where {S, T, N}
convert(::Type{CategoricalArray{T}}, A::AbstractArray{S, N}) where {S, T, N} =
convert(CategoricalArray{T, N}, A)
convert(::Type{CategoricalArray}, A::AbstractArray{T, N}) where {T, N} =
convert(CategoricalArray{T, N}, A)
convert(CategoricalArray{fixtype(A), N}, A)

convert(::Type{CategoricalVector{T}}, A::AbstractVector) where {T} =
convert(CategoricalVector{T, DefaultRefType}, A)
convert(::Type{CategoricalVector}, A::AbstractVector{T}) where {T} =
convert(CategoricalVector{T}, A)
convert(CategoricalVector{fixtype(A)}, A)
convert(::Type{CategoricalVector{T}},
A::CategoricalVector{S, R}) where {S, T, R <: Integer} =
convert(CategoricalVector{T, R}, A)
Expand All @@ -301,7 +325,7 @@ convert(::Type{CategoricalVector}, A::CategoricalVector) = A
convert(::Type{CategoricalMatrix{T}}, A::AbstractMatrix) where {T} =
convert(CategoricalMatrix{T, DefaultRefType}, A)
convert(::Type{CategoricalMatrix}, A::AbstractMatrix{T}) where {T} =
convert(CategoricalMatrix{T}, A)
convert(CategoricalMatrix{fixtype(A)}, A)
convert(::Type{CategoricalMatrix{T}},
A::CategoricalMatrix{S, R}) where {S, T, R <: Integer} =
convert(CategoricalMatrix{T, R}, A)
Expand All @@ -313,6 +337,8 @@ convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}) where {S, T,

function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N};
levels::Union{AbstractVector, Nothing}=nothing) where {S, T, N, R}
check_supported_eltype(T, T)

res = CategoricalArray{T, N, R}(undef, size(A), levels=levels)
copyto!(res, A)

Expand Down Expand Up @@ -698,7 +724,7 @@ function vcat(A::CategoricalArray...)
[x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)}
end

T = Base.promote_eltype(A...) >: Missing ?
T = cat_promote_eltype(A...) >: Missing ?
Union{eltype(newlevels), Missing} : eltype(newlevels)
refs = DefaultRefType[refsvec...;]
pool = CategoricalPool(newlevels, ordered)
Expand Down Expand Up @@ -912,15 +938,15 @@ are preserved unless explicitly overriden.
compress::Bool=false) where {T, N}
# @inline is needed so that return type is inferred when compress is not provided
RefType = compress ? reftype(length(unique(A))) : DefaultRefType
CategoricalArray{fixstringtype(T), N, RefType}(A, levels=levels, ordered=ordered)
CategoricalArray{fixtype(A), N, RefType}(A, levels=levels, ordered=ordered)
end
@inline function categorical(A::CategoricalArray{T, N, R};
levels::Union{AbstractVector, Nothing}=nothing,
ordered=_isordered(A),
compress::Bool=false) where {T, N, R}
# @inline is needed so that return type is inferred when compress is not provided
RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R
CategoricalArray{fixstringtype(T), N, RefType}(A, levels=levels, ordered=ordered)
CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered)
end

function in(x::Any, y::CategoricalArray{T, N, R}) where {T, N, R}
Expand Down Expand Up @@ -1046,21 +1072,11 @@ end

StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}},
A::AbstractVector) where {T} =
categoricalmissing(T, A)
CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing))
StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}},
A::Vector) where {T} =
categoricalmissing(T, A)
categoricalmissing(T, A::AbstractVector) =
CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing))

StructTypes.construct(::Type{<:CategoricalArray{Union{Nothing, T}}},
A::AbstractVector) where {T} =
categoricalnothing(T, A)
StructTypes.construct(::Type{<:CategoricalArray{Union{Nothing, T}}},
A::Vector) where {T} =
categoricalnothing(T, A)
categoricalnothing(T, A::AbstractVector) = CategoricalArray{Union{Nothing, T}}(A)

# DataAPI refarray/refvalue/refpool support
struct CategoricalRefPool{T, P} <: AbstractVector{T}
pool::P
Expand Down
2 changes: 1 addition & 1 deletion src/pool.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ avoid doing a dict lookup twice
end

function mergelevels(ordered, levels...)
T = Base.promote_eltype(levels...)
T = cat_promote_eltype(levels...)
res = Vector{T}(undef, 0)

nonempty_lv = findfirst(!isempty, levels)
Expand Down
19 changes: 10 additions & 9 deletions src/recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,9 @@ recode!(a::AbstractArray, default::Any, pairs::Pair...) =
recode!(a, a, default, pairs...)
recode!(a::AbstractArray, pairs::Pair...) = recode!(a, a, nothing, pairs...)

# Promoted type of the values (second type parameter) of one or more pairs,
# computed with `promote_type`.
promote_valuetype(x::Pair{K, V}) where {K, V} = V
function promote_valuetype(x::Pair{K, V}, y::Pair...) where {K, V}
    rest = promote_valuetype(y...)
    return promote_type(V, rest)
end
# Promoted type of the values (second type parameter) of one or more pairs,
# combined pairwise with `cat_promote_type`.
cat_promote_valuetype(x::Pair{K, V}) where {K, V} = V
function cat_promote_valuetype(x::Pair{K, V}, y::Pair...) where {K, V}
    rest = cat_promote_valuetype(y...)
    return cat_promote_type(V, rest)
end

# Whether the key type of at least one of the pairs is exactly `Missing`.
# Only the type parameter is inspected, not the key value, so a pair whose
# key type is a `Union` including `Missing` does not count.
keytype_hasmissing(x::Pair{K}) where {K} = K === Missing
function keytype_hasmissing(x::Pair{K}, y::Pair...) where {K}
    K === Missing && return true
    return keytype_hasmissing(y...)
end
Expand Down Expand Up @@ -350,11 +351,11 @@ recode(a::AbstractArray, pairs::Pair...) = recode(a, nothing, pairs...)
recode(a::CategoricalArray, pairs::Pair...) = recode(a, nothing, pairs...)

function recode(a::AbstractArray, default::Any, pairs::Pair...)
V = promote_valuetype(pairs...)
V = cat_promote_valuetype(pairs...)
# T cannot take into account eltype(src), since we can't know
# whether it matters at compile time (all levels recoded or not)
# and using a wider type than necessary would be annoying
T = default isa Nothing ? V : promote_type(typeof(default), V)
T = default isa Nothing ? V : cat_promote_type(typeof(default), V)
# Exception 1: if T === Missing and default not missing,
# assume the caller wants to recode only some values to missing,
# but accept original values
Expand All @@ -371,11 +372,11 @@ function recode(a::AbstractArray, default::Any, pairs::Pair...)
end

function recode(a::CategoricalArray{S, N, R}, default::Any, pairs::Pair...) where {S, N, R}
V = promote_valuetype(pairs...)
V = cat_promote_valuetype(pairs...)
# T cannot take into account eltype(src), since we can't know
# whether it matters at compile time (all levels recoded or not)
# and using a wider type than necessary would be annoying
T = default isa Nothing ? V : promote_type(typeof(default), V)
T = default isa Nothing ? V : cat_promote_type(typeof(default), V)
# Exception 1: if T === Missing and default not missing,
# assume the caller wants to recode only some values to missing,
# but accept original values
Expand All @@ -396,13 +397,13 @@ end
function Base.replace(a::CategoricalArray{S, N, R}, pairs::Pair...) where {S, N, R}
# Base.replace(a::Array, pairs::Pair...) uses a wider type promotion than
# recode. It promotes the source type S with the replaced types T.
T = promote_valuetype(pairs...)
T = cat_promote_valuetype(pairs...)
# Exception: replacing missings
# Example: replace(categorical([missing,1.5]), missing=>0)
if keytype_hasmissing(pairs...)
dest = CategoricalArray{promote_type(nonmissingtype(S), T), N, R}(undef, size(a))
dest = CategoricalArray{cat_promote_type(nonmissingtype(S), T), N, R}(undef, size(a))
else
dest = CategoricalArray{promote_type(S, T), N, R}(undef, size(a))
dest = CategoricalArray{cat_promote_type(S, T), N, R}(undef, size(a))
end
recode!(dest, a, nothing, pairs...)
end
Expand Down
14 changes: 7 additions & 7 deletions src/typedefs.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
const DefaultRefType = UInt32
const SupportedTypes = Union{AbstractString, AbstractChar, Number}

## Pools

# Type params:
# * `T` type of categorized values
# * `R` integer type for referencing category levels
# * `V` categorical value type
mutable struct CategoricalPool{T, R <: Integer, V}
mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
levels::Vector{T} # category levels ordered by their reference codes
invindex::Dict{T, R} # map from category levels to their reference codes
valindex::Vector{V} # "category value" objects 1-to-1 matching `index`
Expand Down Expand Up @@ -42,9 +43,6 @@ mutable struct CategoricalPool{T, R <: Integer, V}
function CategoricalPool{T, R, V}(levels::Vector{T},
invindex::Dict{T, R},
ordered::Bool) where {T, R, V}
if T <: CategoricalValue && T !== Union{}
throw(ArgumentError("Level type $T cannot be a categorical value type"))
end
if !(V <: CategoricalValue)
throw(ArgumentError("Type $V is not a categorical value type"))
end
Expand All @@ -70,7 +68,7 @@ end
## Values

"""
CategoricalValue{T, R <: Integer}
CategoricalValue{T <: $SupportedTypes, R <: Integer}

A wrapper around a value of type `T` corresponding to a level
in a `CategoricalPool`.
Expand All @@ -82,7 +80,7 @@ if [`isordered`](@ref) is `true` for the value's pool, and in that case
the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the standard
ordering of values of type `T`.
"""
struct CategoricalValue{T, R <: Integer}
struct CategoricalValue{T <: SupportedTypes, R <: Integer}
level::R
pool::CategoricalPool{T, R, CategoricalValue{T, R}}
end
Expand All @@ -96,7 +94,9 @@ end
# * `V` original type of elements (excluding Missing) before categorization
# * `C` categorical value type
# * `U` type of missing value, `Union{}` if missing values are not accepted
abstract type AbstractCategoricalArray{T, N, R, V, C, U} <: AbstractArray{Union{C, U}, N} end
abstract type AbstractCategoricalArray{T <: Union{CategoricalValue, SupportedTypes, Missing}, N,
R <: Integer, V, C <: CategoricalValue, U} <:
AbstractArray{Union{C, U}, N} end
const AbstractCategoricalVector{T, R, V, C, U} = AbstractCategoricalArray{T, 1, R, V, C, U}
const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2, R, V, C, U}

Expand Down
Loading