diff --git a/NEWS.md b/NEWS.md index f5a60b8431..608493a60a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# DataFrames.jl v1.3.2 Patch Release Notes + +## Bug fixes + +* Fix aliasing detection in `sort!` (now only identical columns passing `===` + test are considered aliases) + ([#2981](https://github.com/JuliaData/DataFrames.jl/issues/2981)) + # DataFrames.jl v1.3.1 Patch Release Notes ## Bug fixes diff --git a/Project.toml b/Project.toml index 8ed4e93129..1468c2cbd9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DataFrames" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.1" +version = "1.3.2" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl index 91ad82d99f..c5ca91c62c 100755 --- a/src/abstractdataframe/sort.jl +++ b/src/abstractdataframe/sort.jl @@ -595,18 +595,23 @@ _sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = rev::Union{Bool, AbstractVector{Bool}}=false, order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) -Sort data frame `df` by column(s) `cols`. -Sorting on multiple columns is done lexicographicallly. +Sort data frame `df` by column(s) `cols`. Sorting on multiple columns is done +lexicographicallly. `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If -`cols` selects no columns, sort `df` on all columns (this -behaviour is deprecated and will change in future versions). +`cols` selects no columns, sort `df` on all columns (this behaviour is +deprecated and will change in future versions). $SORT_ARGUMENTS -If `alg` is `nothing` (the default), the most appropriate algorithm is -chosen automatically among `TimSort`, `MergeSort` and `RadixSort` depending -on the type of the sorting columns and on the number of rows in `df`. +If `alg` is `nothing` (the default), the most appropriate algorithm is chosen +automatically among `TimSort`, `MergeSort` and `RadixSort` depending on the type +of the sorting columns and on the number of rows in `df`. + +`sort!` will produce a correct result even if some columns of passed data frame +are identical (checked with `===`). Otherwise, if two columns share some part of +memory but are not identical (e.g. are different views of the same parent +vector) then `sort!` result might be incorrect. # Examples ```jldoctest @@ -678,22 +683,20 @@ function Base.sort!(df::AbstractDataFrame, cols=All(); end function Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering) - c = collect(eachcol(df)) - toskip = Set{Int}() - for (i, col) in enumerate(c) - # Check if this column has been sorted already - if any(j -> c[j] === col, 1:i-1) + seen_cols = IdDict{Any, Nothing}() + for (i, col) in enumerate(eachcol(df)) + if haskey(seen_cols, col) push!(toskip, i) - elseif any(j -> Base.mightalias(c[j], col), 1:i-1) - throw(ArgumentError("data frame contains non identical columns that share the same memory")) + else + seen_cols[col] = nothing end end p = _sortperm(df, a, o) pp = similar(p) - for (i, col) in enumerate(c) + for (i, col) in enumerate(eachcol(df)) if !(i in toskip) copyto!(pp, p) Base.permute!!(col, pp) diff --git a/test/sort.jl b/test/sort.jl index 0bb4c9a710..f66cb93c25 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -210,6 +210,7 @@ end @testset "sort! tests" begin # safe aliasing test + # this works because 2:5 is immutable df = DataFrame() x = [10:-1:1;] df.x1 = view(x, 2:5) @@ -219,6 +220,15 @@ end @test x == [10, 6, 7, 8, 9, 5, 4, 3, 2, 1] @test df == DataFrame(x1=6:9, x2=6:9) + df = DataFrame() + x = [10:-1:1;] + df.x1 = view(x, 2:5) + df.x2 = view(x, [2:5;]) + sort!(df) + @test_broken issorted(df) + @test_broken x == [10, 6, 7, 8, 9, 5, 4, 3, 2, 1] + @test_broken df == DataFrame(x1=6:9, x2=6:9) + df = DataFrame() x = [10:-1:1;] df.x1 = view(x, 2:5) @@ -229,23 +239,33 @@ end @test x == [10, 9, 6, 7, 8, 5, 4, 3, 2, 1] @test df == DataFrame(x1=[9; 6:8], x2=[9; 6:8]) + df = DataFrame() + x = [10:-1:1;] + df.x1 = view(x, [2:5;]) + df.x2 = view(x, 2:5) + dfv = view(df, 2:4, :) + sort!(dfv) + @test_broken issorted(dfv) + @test_broken x == [10, 9, 6, 7, 8, 5, 4, 3, 2, 1] + @test_broken df == DataFrame(x1=[9; 6:8], x2=[9; 6:8]) + # unsafe aliasing test df = DataFrame() x = [10:-1:1;] df.x1 = view(x, 2:5) df.x2 = view(x, 3:6) - @test_throws ArgumentError sort!(df) - @test x == 10:-1:1 - @test df == DataFrame(x1=9:-1:6, x2=8:-1:5) + sort!(df) + @test_broken x == 10:-1:1 + @test_broken df == DataFrame(x1=9:-1:6, x2=8:-1:5) df = DataFrame() x = [10:-1:1;] df.x1 = view(x, 2:5) df.x2 = view(x, 3:6) dfv = view(df, 2:4, :) - @test_throws ArgumentError sort!(dfv) - @test x == 10:-1:1 - @test df == DataFrame(x1=9:-1:6, x2=8:-1:5) + sort!(dfv) + @test_broken x == 10:-1:1 + @test_broken df == DataFrame(x1=9:-1:6, x2=8:-1:5) # complex view sort test Random.seed!(1234) @@ -323,4 +343,59 @@ end end end +@testset "correct aliasing detection" begin + # make sure sorting a view does not produce an error + for sel in ([1, 2, 3], 1:3) + df = DataFrame(a=1:5, b=11:15, c=21:25) + dfc = copy(df) + sdf = view(df, sel, :) + sort!(sdf) + @test df[:, 1:3] == dfc + end + + Random.seed!(1234) + for sel in ([1:100;], 1:100) + df = DataFrame(a=rand(100), b=rand(100), c=rand(100), id=1:100) + dfc = sort(df) + sdf = view(df, sel, :) + sort!(sdf) + @test df == dfc + sort!(df, :id) + df.d = df.a + sort!(sdf) + @test df[:, 1:4] == dfc + end + + # this is a test that different views are incorrectly handled + x = [1:6;] + df = DataFrame(a=view(x, 1:5), b=view(x, 6:-1:2), copycols=false) + dfv = view(df, 1:3, :) + sort!(dfv, :b) + @test df == DataFrame(a=[3, 2, 1, 6, 5], b=[4, 5, 6, 1, 2]) + # this is the "correct" result if we had no aliasing + x = [1:6;] + df = DataFrame(a=view(x, 1:5), b=view(x, 6:-1:2)) + dfv = view(df, 1:3, :) + sort!(dfv, :b) + @test df == DataFrame(a=[3, 2, 1, 4, 5], b=[4, 5, 6, 3, 2]) + + df = DataFrame(x1=rand(10)) + df.x2 = df.x1 + df.x3 = df.x1 + df.x4 = df.x1 + df.x5 = df.x1 + sort!(df, :x4) + @test issorted(df) + + df = DataFrame(x1=rand(10)) + df.x2 = df.x1 + df.x3 = df.x1 + df.x4 = df.x1 + df.x5 = df.x1 + dfv = @view df[1:5, 1:4] + sort!(dfv, :x4) + @test issorted(dfv) + @test issorted(df[1:5, :]) +end + end # module