-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: Better copyto #172
WIP: Better copyto #172
Changes from all commits
5528a9a
5b5f565
e6a51fa
ff08d35
c0f3f61
b4e8089
35d103f
5c5e571
89ed2e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -405,11 +405,12 @@ copy(A::CategoricalArray) = deepcopy(A) | |
CatArrOrSub{T, N} = Union{CategoricalArray{T, N}, | ||
SubArray{<:Any, N, <:CategoricalArray{T}}} where {T, N} | ||
|
||
function copyto!(dest::CatArrOrSub{T, N}, dstart::Integer, | ||
src::CatArrOrSub{<:Any, N}, sstart::Integer, | ||
n::Integer) where {T, N} | ||
function copyto!(dest::CatArrOrSub, dstart::Integer, | ||
src::CatArrOrSub, sstart::Integer, | ||
n::Integer) | ||
n == 0 && return dest | ||
n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative"))) | ||
n < 0 && throw(ArgumentError(string("tried to copy n=", n, | ||
" elements, but n should be nonnegative"))) | ||
destinds, srcinds = LinearIndices(dest), LinearIndices(src) | ||
(dstart ∈ destinds && dstart+n-1 ∈ destinds) || throw(BoundsError(dest, dstart:dstart+n-1)) | ||
(sstart ∈ srcinds && sstart+n-1 ∈ srcinds) || throw(BoundsError(src, sstart:sstart+n-1)) | ||
|
@@ -419,43 +420,49 @@ function copyto!(dest::CatArrOrSub{T, N}, dstart::Integer, | |
dpool = pool(dest) | ||
spool = pool(src) | ||
|
||
# try converting src to dest type to avoid partial copy corruption of dest | ||
# in the event that the src cannot be copied into dest | ||
slevs = convert(Vector{T}, levels(src)) | ||
if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs) | ||
throw(MissingException("cannot copy array with missing values to an array with element type $T")) | ||
end | ||
|
||
newlevels, ordered = mergelevels(isordered(dest), levels(dest), slevs) | ||
# Orderedness cannot be preserved if the source was unordered and new levels | ||
# need to be added: new comparisons would only be based on the source's order | ||
# (this is consistent with what happens when adding a new level via setindex!) | ||
ordered &= isordered(src) | (length(newlevels) == length(levels(dest))) | ||
if ordered != isordered(dest) | ||
isa(dest, SubArray) && throw(ArgumentError("cannot set ordered=$ordered on dest SubArray as it would affect the parent. Found when trying to set levels to $newlevels.")) | ||
ordered!(dest, ordered) | ||
end | ||
|
||
# Simple case: replace all values | ||
if !isa(dest, SubArray) && dstart == dstart == 1 && n == length(dest) == length(src) | ||
# Set index to reflect refs | ||
levels!(dpool, T[]) # Needed in case src and dest share some levels | ||
levels!(dpool, index(spool)) | ||
|
||
# Set final levels in their visible order | ||
levels!(dpool, newlevels) | ||
|
||
copyto!(drefs, srefs) | ||
else # More work to do: preserve some values (and therefore index) | ||
levels!(dpool, newlevels) | ||
|
||
indexmap = indexin(index(spool), index(dpool)) | ||
|
||
@inbounds for i = 0:(n-1) | ||
x = srefs[sstart+i] | ||
drefs[dstart+i] = x > 0 ? indexmap[x] : 0 | ||
if isordered(dest) | ||
remap = similar(levels(src), eltype(drefs)) | ||
seen = falses(length(levels(src))) # needed as we have to dynamically collect levels | ||
@inbounds for i in 0:n-1 | ||
s = srefs[sstart+i] | ||
if eltype(src) >: Missing && s == 0 | ||
if eltype(dest) >: Missing | ||
drefs[dstart+i] = 0 | ||
else | ||
throw(MissingException("cannot copy array with missing values to an array with element type $T")) | ||
end | ||
else | ||
if seen[s] | ||
drefs[dstart+i] = remap[s] | ||
else | ||
seen[s] = true | ||
rm = index(dpool)[s] | ||
remap[s] = rm | ||
drefs[dstart+i] = rm | ||
end | ||
end | ||
end | ||
else | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure we need to duplicate all of this code for performance? Branch prediction is usually quite efficient, so even if you have an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would have to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since there's already a branch I don't think it will matter, even if the compiler doesn't hoist it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but we have to do |
||
remap = Vector{eltype(drefs)}(undef, length(levels(src))) | ||
seen = falses(length(levels(src))) # needed as we have to dynamically collect levels | ||
@inbounds for i in 0:n-1 | ||
s = srefs[sstart+i] | ||
if eltype(src) >: Missing && s == 0 | ||
if eltype(dest) >: Missing | ||
drefs[dstart+i] = 0 | ||
else | ||
throw(MissingException("cannot copy array with missing values to an array with element type $T")) | ||
end | ||
else | ||
if seen[s] | ||
drefs[dstart+i] = remap[s] | ||
else | ||
seen[s] = true | ||
dest[dstart+i] = src[sstart+i] | ||
remap[s] = drefs[dstart+i] | ||
end | ||
end | ||
end | ||
|
||
end | ||
|
||
dest | ||
|
@@ -464,11 +471,10 @@ end | |
copyto!(dest::CatArrOrSub, src::CatArrOrSub) = | ||
copyto!(dest, 1, src, 1, length(src)) | ||
|
||
copyto!(dest::CatArrOrSub, dstart::Integer, src::CatArrOrSub) = | ||
copyto!(dest, dstart, src, 1, length(src)) | ||
|
||
@static if VERSION >= v"0.7.0-DEV.3208" | ||
using Future | ||
Future.copy!(dest::CatArrOrSub, src::AbstractArray) = | ||
copyto!(dest, 1, src, 1, length(src)) | ||
Future.copy!(dest::CatArrOrSub, src::CatArrOrSub) = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method isn't needed now AFAICT. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you want to happen if we want to |
||
copyto!(dest, 1, src, 1, length(src)) | ||
end | ||
|
@@ -694,14 +700,12 @@ function Base.push!(A::CategoricalVector, item) | |
A | ||
end | ||
|
||
function Base.append!(A::CategoricalVector, B::CatArrOrSub) | ||
levels!(A, union(levels(A), levels(B))) | ||
function Base.append!(A::CategoricalVector, B::AbstractArray) | ||
len = length(A) | ||
len2 = length(B) | ||
resize!(A.refs, len + len2) | ||
for i = 1:len2 | ||
A[len + i] = B[i] | ||
end | ||
# As in Base, A will be left modified if it is not possible to copy B to A | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder why Base doesn't call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe the reason is that you would have to do try-catch which will slow down the function? I will open an issue with Base. |
||
copyto!(A, len + 1, B) | ||
return A | ||
end | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -188,18 +188,37 @@ end | |
levels!(x, ["Young", "Middle", "Old"]) | ||
ordered!(x, true) | ||
|
||
for copyf! in (copy!, copyto!) | ||
y = CategoricalArray{Union{T, String}}(["X", "Z", "Y", "X"]) | ||
@test_throws KeyError copyf!(x, y) | ||
end | ||
|
||
x = CategoricalArray{Union{T, String}}(["Old", "Young", "Middle", "Young"]) | ||
levels!(x, ["Young", "Middle", "Old"]) | ||
|
||
for copyf! in (copy!, copyto!) | ||
y = CategoricalArray{Union{T, String}}(["X", "Z", "Y", "X"]) | ||
@test copyf!(x, y) === x | ||
@test x == y | ||
@test levels(x) == ["Young", "Middle", "Old", "X", "Y", "Z"] | ||
# the additional levels are added in order of appereance in y | ||
@test levels(x) == ["Young", "Middle", "Old", "X", "Z", "Y"] | ||
@test !isordered(x) | ||
end | ||
|
||
x = CategoricalArray{Union{T, String}}(["Old", "Young", "Middle", "Young"]) | ||
levels!(x, ["Young", "Middle", "Old"]) | ||
ordered!(x, true) | ||
y = CategoricalArray{Union{T, String}}(["X", "Z", "Y", "X"]) | ||
# Test with missing values | ||
if T === Missing | ||
x[3] = missing | ||
y[3] = missing | ||
end | ||
@test_throws KeyError copyto!(x, 1, y, 2) | ||
|
||
x = CategoricalArray{Union{T, String}}(["Old", "Young", "Middle", "Young"]) | ||
levels!(x, ["Young", "Middle", "Old"]) | ||
y = CategoricalArray{Union{T, String}}(["X", "Z", "Y", "X"]) | ||
a = (Union{String, Missing})["Z", "Y", "X", "Young"] | ||
# Test with missing values | ||
if T === Missing | ||
|
@@ -442,6 +461,32 @@ end | |
end | ||
end | ||
|
||
@testset "copyto! and copy! corner cases" begin | ||
x = categorical([1:255;1:255], true) | ||
y = categorical([1, 1000], true) | ||
@test copyto!(x, 1, y, 1, 1)[1] == y[1] | ||
@test_throws ErrorException copyto!(x, 1, y, 1, 1)[2] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better check the length? Actually, better check the whole contents of the result (same for the line above). |
||
|
||
x = categorical([1,2,3]) | ||
y = categorical([5,6,7]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better use unsorted values to really test that the order is preserved. |
||
copyto!(x,y) | ||
@test levels(x) = [1,2,3,5,6,7] | ||
|
||
@test copy!(x, [1,1,1]) == [1,1,1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not also check levels? Also with different unsorted values? Same below. |
||
|
||
x = categorical([1,2,3]) | ||
y = categorical([5,6,7]) | ||
@test copyto!(x, 1, y, 1, 1) == [5, 2, 3] | ||
@test levels(x) == [1, 2, 3, 5] | ||
|
||
x = categorical([1,2,3], ordered=true) | ||
y = categorical([10]) | ||
@test_throws KeyError copyto!(x, 1, y, 1) | ||
|
||
x = categorical([1, 2, 3]) | ||
@test copy!(x, [1, 1, 1]) == [1, 1, 1] | ||
end | ||
|
||
@testset "resize!()" begin | ||
x = CategoricalArray{Union{T, String}}(["Old", "Young", "Middle", "Young"]) | ||
@test resize!(x, 3) === x | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't you just use
remap[s] == -1
as a sentinel?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is what I have done first, but allocating
seen
should be cheap, and the problem is thatremap
, in order to be fast, should have the same eltype asdrefs
and they are unsigned.At least this is what I thought. Now I see that we can use
remap[s] == 0
as sentinel, as we use0
for missing which is handled in other way anyway. I will change this