From 530f7e2d618582f46355f8286c01a7315a88b9e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 12 Mar 2023 21:14:57 +0100 Subject: [PATCH 1/2] add Iterators.partition for DataFrameRows --- NEWS.md | 7 ++++ src/abstractdataframe/iteration.jl | 60 ++++++++++++++++++++++++++++++ test/dataframe.jl | 21 +++++++++++ 3 files changed, 88 insertions(+) diff --git a/NEWS.md b/NEWS.md index 86721af196..25126c37c8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# DataFrames.jl v1.6 Release Notes + +## New functionalities + +* Add `Iterators.partition` support for `DataFrameRows` + ([#3299](https://github.com/JuliaData/DataFrames.jl/pull/3299)) + # DataFrames.jl v1.5 Release Notes ## New functionalities diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index d20285e4dc..da986d7a7c 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -92,6 +92,66 @@ Compat.hasproperty(itr::DataFrameRows, s::AbstractString) = haskey(index(parent( # Private fields are never exposed since they can conflict with column names Base.propertynames(itr::DataFrameRows, private::Bool=false) = propertynames(parent(itr)) +""" + Iterators.partition(dfr::DataFrameRows, n::Integer) + +Iterate over `dfr` `DataFrameRows` `n` rows at a time, returning each block +as a `DataFraneRows` over a view of rows of parent of `dfr`. 
+ +# Examples + +```jldoctest +julia> collect(Iterators.partition(eachrow(DataFrame(x=1:5)), 2)) +3-element Vector{DataFrames.DataFrameRows{SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}}}: + 2×1 DataFrameRows + Row │ x + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 2×1 DataFrameRows + Row │ x + │ Int64 +─────┼─────── + 1 │ 3 + 2 │ 4 + 1×1 DataFrameRows + Row │ x + │ Int64 +─────┼─────── + 1 │ 5 +``` +""" +function Iterators.partition(dfr::DataFrameRows, n::Integer) + n < 1 && throw(ArgumentError("cannot create partitions of length $n")) + return Iterators.PartitionIterator(dfr, Int(n)) +end + +# use autodetection of eltype +Base.IteratorEltype(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = + Base.EltypeUnknown() + +# we do not need to be overly specific here as we rely on autodetection of eltype +# this method is needed only to override the fallback for `PartitionIterator` +Base.eltype(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = + DataFrameRows + +IteratorSize(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = + Base.HasLength() + +function Base.length(itr::Iterators.PartitionIterator{<:DataFrameRows}) + l = nrow(parent(itr.c)) + return cld(l, itr.n) +end + +function Base.iterate(itr::Iterators.PartitionIterator{<:DataFrameRows}, state::Int=1) + df = parent(itr.c) + last_idx = nrow(df) + state > last_idx && return nothing + r = min(state + itr.n - 1, last_idx) + return eachrow(view(df, state:r, :)), r + 1 +end + # Iteration by columns const DATAFRAMECOLUMNS_DOCSTR = """ diff --git a/test/dataframe.jl b/test/dataframe.jl index febfe4cd35..d9cce76828 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -2311,6 +2311,20 @@ end @test all(v -> v isa SubDataFrame, res) @test_throws ArgumentError Iterators.partition(df, false) @test_throws ArgumentError Iterators.partition(df, -1) + + dfr = eachrow(df) + p = Iterators.partition(dfr, 2) + @test p isa Iterators.PartitionIterator + @test Tables.partitions(p) === p + @test 
eltype(p) === DataFrames.DataFrameRows + @test Base.IteratorEltype(typeof(p)) === Base.EltypeUnknown() + @test length(p) == 3 + @test Base.IteratorSize(typeof(p)) === Base.HasLength() + res = collect(p) + @test res == eachrow.([DataFrame(x=1:2), DataFrame(x=3:4), DataFrame(x=5)]) + @test all(v -> v isa DataFrames.DataFrameRows, res) + @test_throws ArgumentError Iterators.partition(dfr, false) + @test_throws ArgumentError Iterators.partition(dfr, -1) end p = Iterators.partition(DataFrame(), 1) @test p isa Iterators.PartitionIterator @@ -2318,6 +2332,13 @@ end @test isempty(p) @test length(p) == 0 @test eltype(collect(p)) <: SubDataFrame + + p = Iterators.partition(eachrow(DataFrame()), 1) + @test p isa Iterators.PartitionIterator + @test Tables.partitions(p) === p + @test isempty(p) + @test length(p) == 0 + @test eltype(collect(p)) <: DataFrames.DataFrameRows end end # module From 8f3b475e01d5490231fb75fc27fee7296b111053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 8 Apr 2023 13:29:50 +0200 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/iteration.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index da986d7a7c..4028befa3c 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -95,8 +95,8 @@ Base.propertynames(itr::DataFrameRows, private::Bool=false) = propertynames(pare """ Iterators.partition(dfr::DataFrameRows, n::Integer) -Iterate over `dfr` `DataFrameRows` `n` rows at a time, returning each block -as a `DataFraneRows` over a view of rows of parent of `dfr`. +Iterate over `DataFrameRows` `dfr` `n` rows at a time, returning each block +as a `DataFrameRows` over a view of rows of parent of `dfr`. 
# Examples @@ -136,7 +136,7 @@ Base.IteratorEltype(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = Base.eltype(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = DataFrameRows -IteratorSize(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = +Base.IteratorSize(::Type{<:Iterators.PartitionIterator{<:DataFrameRows}}) = Base.HasLength() function Base.length(itr::Iterators.PartitionIterator{<:DataFrameRows})