diff --git a/NEWS.md b/NEWS.md index e9e9295f88..0146ba5386 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,10 @@ ## New functionalities +* `firstindex`, `lastindex`, `size`, `ndims`, and `axes` are now consistently defined + and documented in the manual for `AbstractDataFrame`, `DataFrameRow`, + `DataFrameRows`, `DataFrameColumns`, `GroupedDataFrame`, `GroupKeys`, and `GroupKey` + ([#2573](https://github.com/JuliaData/DataFrames.jl/pull/2573)) ## Deprecated diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index 493b70b8b9..7de477d7e6 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -235,15 +235,32 @@ The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent. # Common API for types defined in DataFrames.jl -This table presents return value types of calling `names`, `propertynames` and `keys` +This table presents return value types of calling `names`, `propertynames`, `keys`, `length` and `ndims` on types exposed to the user by DataFrames.jl: -| Type | `names` | `propertynames` | `keys` | -|---------------------|------------------|------------------|------------------| -| `AbstractDataFrame` | `Vector{String}` | `Vector{Symbol}` | undefined | -| `DataFrameRow` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | -| `DataFrameRows` | `Vector{String}` | `Vector{Symbol}` | vector of `Int` | -| `DataFrameColumns` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | -| `GroupedDataFrame` | `Vector{String}` | tuple of fields | `GroupKeys` | -| `GroupKeys` | undefined | tuple of fields | vector of `Int` | -| `GroupKey` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | +| Type | `names` | `propertynames` | `keys` | `length` | `ndims` | +|---------------------|------------------|------------------|------------------|-----------|---------| +| `AbstractDataFrame` | `Vector{String}` | `Vector{Symbol}` | undefined | undefined | `2` | +| `DataFrameRow` | `Vector{String}` | `Vector{Symbol}` | 
`Vector{Symbol}` | `Int` | `1` | +| `DataFrameRows` | `Vector{String}` | `Vector{Symbol}` | vector of `Int` | `Int` | `1` | +| `DataFrameColumns` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | `Int` | `1` | +| `GroupedDataFrame` | `Vector{String}` | tuple of fields | `GroupKeys` | `Int` | `1` | +| `GroupKeys` | undefined | tuple of fields | vector of `Int` | `Int` | `1` | +| `GroupKey` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | `Int` | `1` | + +Additionally, for the above types `T` (i.e. `AbstractDataFrame`, `DataFrameRow`, `DataFrameRows`, +`DataFrameColumns`, `GroupedDataFrame`, `GroupKeys`, `GroupKey`) the following methods are defined: +* `size(::T)` returning a `Tuple` of `Int`. +* `size(::T, ::Integer)` returning an `Int`. +* `axes(::T)` returning a `Tuple` of `Int` vectors. +* `axes(::T, ::Integer)` returning an `Int` vector for a valid dimension (except + `DataFrameRows` and `GroupKeys` for which `Base.OneTo(1)` is also returned for + a dimension higher than a valid one because they are `AbstractVector`). +* `firstindex(::T)` returning `1` (except `AbstractDataFrame` for which it is undefined). +* `firstindex(::T, ::Integer)` returning `1` for a valid dimension (except `DataFrameRows` + and `GroupKeys` for which `1` is also returned for a dimension higher than a valid one + because they are `AbstractVector`). +* `lastindex(::T)` returning an `Int` (except `AbstractDataFrame` for which it is undefined). +* `lastindex(::T, ::Integer)` returning an `Int` for a valid dimension (except `DataFrameRows` + and `GroupKeys` for which `1` is also returned for a dimension higher than a valid one + because they are `AbstractVector`). 
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index cb8aa43d43..75703ca840 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -332,7 +332,10 @@ end Base.isempty(df::AbstractDataFrame) = size(df, 1) == 0 || size(df, 2) == 0 -Base.lastindex(df::AbstractDataFrame, i::Integer) = last(axes(df, i)) +if VERSION < v"1.6" + Base.firstindex(df::AbstractDataFrame, i::Integer) = first(axes(df, i)) + Base.lastindex(df::AbstractDataFrame, i::Integer) = last(axes(df, i)) +end Base.axes(df::AbstractDataFrame, i::Integer) = Base.OneTo(size(df, i)) """ diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 2858993338..1c5a30d632 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -176,14 +176,25 @@ Base.IteratorSize(::Type{<:DataFrameColumns}) = Base.HasShape{1}() Base.size(itr::DataFrameColumns) = (size(parent(itr), 2),) function Base.size(itr::DataFrameColumns, d::Integer) - d < 1 && throw(ArgumentError("dimension out of range")) - return d == 1 ? size(itr)[1] : 1 + d != 1 && throw(ArgumentError("dimension out of range")) + return size(itr)[1] end +Base.ndims(::DataFrameColumns) = 1 +Base.ndims(::Type{<:DataFrameColumns}) = 1 + Base.length(itr::DataFrameColumns) = size(itr)[1] Base.eltype(::Type{<:DataFrameColumns}) = AbstractVector + Base.firstindex(itr::DataFrameColumns) = 1 Base.lastindex(itr::DataFrameColumns) = length(itr) + +if VERSION < v"1.6" + Base.firstindex(itr::DataFrameColumns, i::Integer) = first(axes(itr, i)) + Base.lastindex(itr::DataFrameColumns, i::Integer) = last(axes(itr, i)) +end +Base.axes(itr::DataFrameColumns, i::Integer) = Base.OneTo(size(itr, i)) + Base.iterate(itr::DataFrameColumns, i::Integer=1) = i <= length(itr) ? 
(itr[i], i + 1) : nothing Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::ColumnIndex) = diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl index fb4473d45a..141a0862cf 100644 --- a/src/dataframerow/dataframerow.jl +++ b/src/dataframerow/dataframerow.jl @@ -376,8 +376,15 @@ Return the number of dimensions of a data frame row, which is always `1`. Base.ndims(::DataFrameRow) = 1 Base.ndims(::Type{<:DataFrameRow}) = 1 +Base.firstindex(r::DataFrameRow) = 1 Base.lastindex(r::DataFrameRow) = length(r) +if VERSION < v"1.6" + Base.firstindex(r::DataFrameRow, i::Integer) = first(axes(r, i)) + Base.lastindex(r::DataFrameRow, i::Integer) = last(axes(r, i)) +end +Base.axes(r::DataFrameRow, i::Integer) = Base.OneTo(size(r, i)) + Base.iterate(r::DataFrameRow) = iterate(r, 1) function Base.iterate(r::DataFrameRow, st) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index ca75a1a3eb..293e4f0b2e 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -380,7 +380,21 @@ function Base.iterate(gd::GroupedDataFrame, i=1) end end -Compat.lastindex(gd::GroupedDataFrame) = gd.ngroups +Base.size(gd::GroupedDataFrame) = (length(gd),) +Base.size(gd::GroupedDataFrame, i::Integer) = size(gd)[i] + +Base.ndims(::GroupedDataFrame) = 1 +Base.ndims(::Type{<:GroupedDataFrame}) = 1 + +Base.firstindex(gd::GroupedDataFrame) = 1 +Base.lastindex(gd::GroupedDataFrame) = gd.ngroups + +if VERSION < v"1.6" + Base.firstindex(gd::GroupedDataFrame, i::Integer) = first(axes(gd, i)) + Base.lastindex(gd::GroupedDataFrame, i::Integer) = last(axes(gd, i)) +end +Base.axes(gd::GroupedDataFrame, i::Integer) = Base.OneTo(size(gd, i)) + Base.first(gd::GroupedDataFrame) = gd[1] Base.last(gd::GroupedDataFrame) = gd[end] @@ -457,6 +471,22 @@ end Base.parent(key::GroupKey) = getfield(key, :parent) Base.length(key::GroupKey) = length(parent(key).cols) + +Base.size(key::GroupKey) = 
(length(key),) +Base.size(key::GroupKey, i::Integer) = size(key)[i] + +Base.ndims(::GroupKey) = 1 +Base.ndims(::Type{<:GroupKey}) = 1 + +Base.firstindex(key::GroupKey) = 1 +Base.lastindex(key::GroupKey) = length(key) + +if VERSION < v"1.6" + Base.firstindex(key::GroupKey, i::Integer) = first(axes(key, i)) + Base.lastindex(key::GroupKey, i::Integer) = last(axes(key, i)) +end +Base.axes(key::GroupKey, i::Integer) = Base.OneTo(size(key, i)) + Base.names(key::GroupKey) = string.(parent(key).cols) # Private fields are never exposed since they can conflict with column names Base.propertynames(key::GroupKey, private::Bool=false) = copy(parent(key).cols) diff --git a/test/indexing.jl b/test/indexing.jl index f3ec2c0866..df7a85770b 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -1859,4 +1859,150 @@ end @test_throws ArgumentError df[1] = [2] end +@testset "array interface tests for all types" begin + df = DataFrame(reshape(1:12, 3, 4), :auto) + @test_throws MethodError length(df) + @test ndims(df) == ndims(typeof(df)) == 2 + @test size(df) == (3, 4) + @test size(df, 1) == 3 + @test size(df, 2) == 4 + @test_throws ArgumentError size(df, 3) + @test_throws ArgumentError size(df, 0) + @test axes(df) == (1:3, 1:4) + @test axes(df, 1) == 1:3 + @test axes(df, 2) == 1:4 + @test_throws ArgumentError axes(df, 3) + @test_throws ArgumentError axes(df, 0) + @test_throws MethodError firstindex(df) + @test firstindex(df, 1) == 1 + @test firstindex(df, 2) == 1 + @test_throws ArgumentError firstindex(df, 3) + @test_throws ArgumentError firstindex(df, 0) + @test_throws MethodError lastindex(df) + @test lastindex(df, 1) == 3 + @test lastindex(df, 2) == 4 + @test_throws ArgumentError lastindex(df, 3) + @test_throws ArgumentError lastindex(df, 0) + + dfr = df[1, 1:3] + @test length(dfr) == 3 + @test ndims(dfr) == ndims(typeof(dfr)) == 1 + @test size(dfr) == (3,) + @test size(dfr, 1) == 3 + @test_throws BoundsError size(dfr, 2) + @test_throws BoundsError size(dfr, 0) + @test axes(dfr) == 
(1:3,) + @test axes(dfr, 1) == 1:3 + @test_throws BoundsError axes(dfr, 2) + @test_throws BoundsError axes(dfr, 0) + @test firstindex(dfr) == 1 + @test firstindex(dfr, 1) == 1 + @test_throws BoundsError firstindex(dfr, 2) + @test_throws BoundsError firstindex(dfr, 0) + @test lastindex(dfr) == 3 + @test lastindex(dfr, 1) == 3 + @test_throws BoundsError lastindex(dfr, 2) + @test_throws BoundsError lastindex(dfr, 0) + + er = eachrow(df) + @test length(er) == 3 + @test ndims(er) == ndims(typeof(er)) == 1 + @test size(er) == (3,) + @test size(er, 1) == 3 + @test size(er, 2) == 1 + @test_throws BoundsError size(er, 0) + @test axes(er) == (1:3,) + @test axes(er, 1) == 1:3 + @test axes(er, 2) == 1:1 + @test_throws BoundsError axes(er, 0) + @test firstindex(er) == 1 + @test firstindex(er, 1) == 1 + @test firstindex(er, 2) == 1 + @test_throws BoundsError firstindex(er, 0) + @test lastindex(er) == 3 + @test lastindex(er, 1) == 3 + @test lastindex(er, 2) == 1 + @test_throws BoundsError lastindex(er, 0) + + ec = eachcol(df) + @test length(ec) == 4 + @test ndims(ec) == ndims(typeof(ec)) == 1 + @test size(ec) == (4,) + @test size(ec, 1) == 4 + @test_throws ArgumentError size(ec, 2) + @test_throws ArgumentError size(ec, 0) + @test axes(ec) == (1:4,) + @test axes(ec, 1) == 1:4 + @test_throws ArgumentError axes(ec, 2) + @test_throws ArgumentError axes(ec, 0) + @test firstindex(ec) == 1 + @test firstindex(ec, 1) == 1 + @test_throws ArgumentError firstindex(ec, 2) + @test_throws ArgumentError firstindex(ec, 0) + @test lastindex(ec) == 4 + @test lastindex(ec, 1) == 4 + @test_throws ArgumentError lastindex(ec, 2) + @test_throws ArgumentError lastindex(ec, 0) + + gdf = groupby(df, [:x1, :x2, :x3]) + @test length(gdf) == 3 + @test ndims(gdf) == ndims(typeof(gdf)) == 1 + @test size(gdf) == (3,) + @test size(gdf, 1) == 3 + @test_throws BoundsError size(gdf, 2) + @test_throws BoundsError size(gdf, 0) + @test axes(gdf) == (1:3,) + @test axes(gdf, 1) == 1:3 + @test_throws BoundsError axes(gdf, 
2) + @test_throws BoundsError axes(gdf, 0) + @test firstindex(gdf) == 1 + @test firstindex(gdf, 1) == 1 + @test_throws BoundsError firstindex(gdf, 2) + @test_throws BoundsError firstindex(gdf, 0) + @test lastindex(gdf) == 3 + @test lastindex(gdf, 1) == 3 + @test_throws BoundsError lastindex(gdf, 2) + @test_throws BoundsError lastindex(gdf, 0) + + kgdf = keys(gdf) + @test length(kgdf) == 3 + @test ndims(kgdf) == ndims(typeof(kgdf)) == 1 + @test size(kgdf) == (3,) + @test size(kgdf, 1) == 3 + @test size(kgdf, 2) == 1 + @test_throws BoundsError size(kgdf, 0) + @test axes(kgdf) == (1:3,) + @test axes(kgdf, 1) == 1:3 + @test axes(kgdf, 2) == 1:1 + @test_throws BoundsError axes(kgdf, 0) + @test firstindex(kgdf) == 1 + @test firstindex(kgdf, 1) == 1 + @test firstindex(kgdf, 2) == 1 + @test_throws BoundsError firstindex(kgdf, 0) + @test lastindex(kgdf) == 3 + @test lastindex(kgdf, 1) == 3 + @test lastindex(kgdf, 2) == 1 + @test_throws BoundsError lastindex(kgdf, 0) + + gk = kgdf[1] + @test length(gk) == 3 + @test ndims(gk) == ndims(typeof(gk)) == 1 + @test size(gk) == (3,) + @test size(gk, 1) == 3 + @test_throws BoundsError size(gk, 2) + @test_throws BoundsError size(gk, 0) + @test axes(gk) == (1:3,) + @test axes(gk, 1) == 1:3 + @test_throws BoundsError axes(gk, 2) + @test_throws BoundsError axes(gk, 0) + @test firstindex(gk) == 1 + @test firstindex(gk, 1) == 1 + @test_throws BoundsError firstindex(gk, 2) + @test_throws BoundsError firstindex(gk, 0) + @test lastindex(gk) == 3 + @test lastindex(gk, 1) == 3 + @test_throws BoundsError lastindex(gk, 2) + @test_throws BoundsError lastindex(gk, 0) +end + end # module diff --git a/test/iteration.jl b/test/iteration.jl index 9c0903c795..43eba85059 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -28,7 +28,7 @@ using Test, DataFrames @test length(eachcol(df)) == size(df, 2) @test size(eachcol(df)) == (size(df, 2),) @test size(eachcol(df), 1) == size(df, 2) - @test size(eachcol(df), 2) == 1 + @test_throws ArgumentError 
size(eachcol(df), 2) @test_throws ArgumentError size(eachcol(df), 0) @test eachcol(df)[1] == df[:, 1] @test eachcol(df)[:A] === df[!, :A]