Merge branch 'main' into nb/manipulation_function_basics

JuliaData · May 20, 2024 · 7614fc3 · 7614fc3
2 parents 621f253 + 0276504
commit 7614fc3
Show file tree

Hide file tree

Showing 24 changed files with 167 additions and 53 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -16,10 +16,35 @@
 
 ## Bug fixes
 
+* Correctly throw an error if a negative number of rows is passed
+  to `first` or `last`
+  ([#3402](https://github.com/JuliaData/DataFrames.jl/pull/3402))
 * Always use the default thread pool for multithreaded operations,
   instead of using the interactive thread pool when Julia was started
   with `-tM,N` with N > 0
   ([#3385](https://github.com/JuliaData/DataFrames.jl/pull/3385))
+* Correctly return `Bool[]` in the `nonunique` function applied to a data frame
+  with a pooled column that has zero levels in the pool
+  ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393))
+* Correctly index `eachrow` and `eachcol` with `CartesianIndex`
+  ([#3413](https://github.com/JuliaData/DataFrames.jl/issues/3413))
+* Correctly handle non-standard integers when converting them to `BigInt`
+  ([#3419](https://github.com/JuliaData/DataFrames.jl/issues/3419))
+
+## Removed deprecations
+
+* The `by` and `aggregate` functions that were deprecated before the 1.0
+  release are now removed.
+  ([#3422](https://github.com/JuliaData/DataFrames.jl/issues/3422))
+
+## Julia compatibility change
+
+* Ensure that `allunique(::AbstractDataFrame, ::Any)` always gets
+  interpreted as a test for uniqueness of rows in the first positional argument
+  ([#3434](https://github.com/JuliaData/DataFrames.jl/issues/3434))
+* Make sure that an empty vector of `Any` or of `AbstractVector` is treated as having
+  no columns when a data frame is being processed with `combine`/`select`/`transform`.
+  ([#3435](https://github.com/JuliaData/DataFrames.jl/issues/3435))
 
 # DataFrames.jl v1.6.1 Release Notes
 

diff --git a/Project.toml b/Project.toml
@@ -31,7 +31,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 CategoricalArrays = "0.10.0"
 Combinatorics = "1.0.2"
 Compat = "4.2"
-DataAPI = "1.15.0"
+DataAPI = "1.16.0"
 DataStructures = "0.18"
 DataValues = "0.4.13"
 InlineStrings = "1.3.0"

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -11,4 +11,4 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Documenter = "0.27"
+Documenter = "1"
diff --git a/docs/make.jl b/docs/make.jl
@@ -16,7 +16,8 @@ makedocs(
     format = Documenter.HTML(
         canonical = "https://juliadata.github.io/DataFrames.jl/stable/",
         assets = ["assets/favicon.ico"],
-        edit_link = "main"
+        edit_link = "main",
+        size_threshold_ignore = ["man/basics.md", "lib/functions.md"],
     ),
     pages = Any[
         "Introduction" => "index.md",
@@ -42,11 +43,10 @@ makedocs(
             hide("Internals" => "lib/internals.md"),
         ]
     ],
-    strict = true
 )
 
-# Deploy built documentation from Travis.
-# =======================================
+# Deploy built documentation.
+# ===========================
 
 deploydocs(
     # options

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -12,12 +12,13 @@ other packages you can check-out the following resources
 * [Data Wrangling with DataFrames.jl Cheat Sheet](https://www.ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/)
 * [DataFrames Tutorial using Jupyter Notebooks](https://github.com/bkamins/Julia-DataFrames-Tutorial/)
 * [Julia Academy DataFrames.jl tutorial](https://github.com/JuliaAcademy/DataFrames)
-* [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial),
-  [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial),
-  [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial),
+* [JuliaCon 2023](https://github.com/bkamins/JuliaCon2023-Tutorial),
   [JuliaCon 2022](https://github.com/bkamins/JuliaCon2022-DataFrames-Tutorial),
-  [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020),
-  and [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021) tutorials
+  [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial),
+  [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial),
+  [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial),
+  [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021),
+  and [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020) tutorials
 * [DataFrames.jl showcase](https://github.com/bkamins/DataFrames-Showcase)
 
 If you prefer to learn DataFrames.jl from a book you can consider reading:

diff --git a/docs/src/man/basics.md b/docs/src/man/basics.md
@@ -175,6 +175,40 @@ julia> DataFrame([(a=1, b=0), (a=2, b=0)])
    2 │     2      0
 ```
 
+Sometimes your source data might have a heterogeneous set of columns for each observation.
+Here is an example:
+
+```
+julia> source = [(type="circle", radius=10), (type="square", side=20)]
+2-element Vector{NamedTuple{names, Tuple{String, Int64}} where names}:
+ (type = "circle", radius = 10)
+ (type = "square", side = 20)
+```
+
+If you want to create a data frame from such data containing all columns present in at least
+one of the source observations, with a `missing` entry wherever a column is not present, then
+you can use the `Tables.dictcolumntable` function to help you create the desired data frame:
+
+```
+julia> DataFrame(Tables.dictcolumntable(source))
+2×3 DataFrame
+ Row │ type    radius   side
+     │ String  Int64?   Int64?
+─────┼──────────────────────────
+   1 │ circle       10  missing
+   2 │ square  missing       20
+```
+
+The role of `Tables.dictcolumntable` is to make sure that the `DataFrame` constructor gets information
+about all columns present in the source data and properly instantiates them. If we did not use
+this function, the `DataFrame` constructor would assume that the first row of data contains the set
+of columns present in the source, which would lead to an error in our example:
+
+```
+julia> DataFrame(source)
+ERROR: type NamedTuple has no field radius
+```
+
 Let us finish our review of constructors by showing how to create a `DataFrame`
 from a matrix. In this case you pass a matrix as a first argument. If the second
 argument is just `:auto` then column names `x1`, `x2`, ... will be auto generated.

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
@@ -443,7 +443,7 @@ A particular common case of a collection that supports the
 a vector of `NamedTuple`s:
 ```jldoctest dataframe
 julia> v = [(a=1, b=2), (a=3, b=4)]
-2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}:
+2-element Vector{@NamedTuple{a::Int64, b::Int64}}:
  (a = 1, b = 2)
  (a = 3, b = 4)
 
@@ -460,7 +460,7 @@ You can also easily convert a data frame back to a vector of `NamedTuple`s:
 julia> using Tables
 
 julia> Tables.rowtable(df)
-2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}:
+2-element Vector{@NamedTuple{a::Int64, b::Int64}}:
  (a = 1, b = 2)
  (a = 3, b = 4)
 ```
diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md
@@ -5,6 +5,9 @@ DataFramesMeta.jl, DataFrameMacros.jl and Query.jl. They implement a functionali
 [dplyr](https://dplyr.tidyverse.org/) or
 [LINQ](https://en.wikipedia.org/wiki/Language_Integrated_Query).
 
+These frameworks are designed both to make it easier for new users to start working with data frames in Julia
+and to allow advanced users to write more compact code.
+
 ## DataFramesMeta.jl
 
 The [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) package
@@ -30,7 +33,7 @@ pipe the output of one transformation as an input to another, as with
 Below we present several selected examples of usage of the package.
 
 First we subset rows of the source data frame using a logical condition
-and select its two columns, renaming one of them:
+and select two of its columns, renaming one of them:
 
 ```jldoctest dataframesmeta
 julia> using DataFramesMeta

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -20,6 +20,7 @@ import DataAPI,
        DataAPI.Between,
        DataAPI.Cols,
        DataAPI.describe,
+       DataAPI.groupby,
        DataAPI.innerjoin,
        DataAPI.outerjoin,
        DataAPI.rightjoin,

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -558,14 +558,19 @@ Base.first(df::AbstractDataFrame) = df[1, :]
     first(df::AbstractDataFrame, n::Integer; view::Bool=false)
 
 Get a data frame with the `n` first rows of `df`.
+Get all rows if `n` is greater than the number of rows in `df`.
+Error if `n` is negative.
 
 If `view=false` a freshly allocated `DataFrame` is returned.
 If `view=true` then a `SubDataFrame` view into `df` is returned.
 
 $METADATA_FIXED
 """
-@inline Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) =
-    view ? Base.view(df, 1:min(n ,nrow(df)), :) : df[1:min(n, nrow(df)), :]
+@inline function Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false)
+    n < 0 && throw(ArgumentError("Number of elements must be nonnegative"))
+    r = min(n, nrow(df))
+    return view ? Base.view(df, 1:r, :) : df[1:r, :]
+end
 
 """
     last(df::AbstractDataFrame)
@@ -580,14 +585,19 @@ Base.last(df::AbstractDataFrame) = df[nrow(df), :]
     last(df::AbstractDataFrame, n::Integer; view::Bool=false)
 
 Get a data frame with the `n` last rows of `df`.
+Get all rows if `n` is greater than the number of rows in `df`.
+Error if `n` is negative.
 
 If `view=false` a freshly allocated `DataFrame` is returned.
 If `view=true` then a `SubDataFrame` view into `df` is returned.
 
 $METADATA_FIXED
 """
-@inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) =
-    view ? Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :]
+@inline function Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false)
+    n < 0 && throw(ArgumentError("Number of elements must be nonnegative"))
+    r = max(1, nrow(df) - n + 1)
+    return view ? Base.view(df, r:nrow(df), :) : df[r:nrow(df), :]
+end
 
 """
     describe(df::AbstractDataFrame; cols=:)
@@ -1476,7 +1486,7 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
     end
 
     # make sure we do not overflow in the target data frame size
-    target_rows = Int(prod(x -> big(length(x)), uniquevals))
+    target_rows = Int(prod(x -> BigInt(length(x)), uniquevals))
     if iszero(target_rows)
         @assert iszero(nrow(df))
         cdf = copy(df)

diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl
@@ -57,7 +57,7 @@ julia> eachrow(df)
    4 │     4     14
 
 julia> copy.(eachrow(df))
-4-element Vector{NamedTuple{(:x, :y), Tuple{Int64, Int64}}}:
+4-element Vector{@NamedTuple{x::Int64, y::Int64}}:
  (x = 1, y = 11)
  (x = 2, y = 12)
  (x = 3, y = 13)
@@ -81,6 +81,7 @@ Base.IndexStyle(::Type{<:DataFrameRows}) = Base.IndexLinear()
 Base.size(itr::DataFrameRows) = (size(parent(itr), 1), )
 
 Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::Int) = parent(itr)[i, :]
+Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::CartesianIndex{1}) = itr[i[1]]
 Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, idx) =
     eachrow(@view parent(itr)[idx isa AbstractVector && !(eltype(idx) <: Bool) ? copy(idx) : idx, :])
 
@@ -263,6 +264,8 @@ Base.iterate(itr::DataFrameColumns, i::Integer=1) =
     i <= length(itr) ? (itr[i], i + 1) : nothing
 Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::ColumnIndex) =
     parent(itr)[!, idx]
+Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::CartesianIndex{1}) =
+    itr[idx[1]]
 Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::MultiColumnIndex) =
     eachcol(parent(itr)[!, idx])
 Base.:(==)(itr1::DataFrameColumns, itr2::DataFrameColumns) =

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -822,7 +822,12 @@ function select_transform!((nc,)::Ref{Any}, df::AbstractDataFrame, newdf::DataFr
             res = newres
         elseif !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix,
                                Tables.AbstractRow})
-            res = Tables.columntable(res)
+            if res isa Union{AbstractVector{Any}, AbstractVector{<:AbstractVector}}
+                @assert isempty(res)
+                res = DataFrame()
+            else
+                res = Tables.columntable(res)
+            end
         end
     end
 

diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl
@@ -87,7 +87,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
     if !(keep in (:first, :last, :noduplicates))
         throw(ArgumentError("`keep` must be :first, :last, or :noduplicates"))
     end
-    ncol(df) == 0 && return Bool[]
+    nrow(df) == 0 && return Bool[]
     res = fill(true, nrow(df))
     cols = ntuple(i -> df[!, i], ncol(df))
     if keep == :first
@@ -207,6 +207,11 @@ function Base.allunique(df::AbstractDataFrame, cols=:)
                             Val(false), nothing, false, nothing, true)[1] == nrow(df)
 end
 
+# avoid invoking Base.allunique(f, iterator) introduced in Julia 1.11
+
+Base.allunique(df::AbstractDataFrame, cols::Tuple) =
+    invoke(Base.allunique, Tuple{AbstractDataFrame, Any}, df, cols)
+
 """
     unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first)
     unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first)

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -1546,7 +1546,7 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
     @assert length(colvalues) == length(colnames)
     @assert all(x -> x isa AbstractVector, colvalues)
 
-    target_rows = Int(prod(x -> big(length(x)), colvalues))
+    target_rows = Int(prod(x -> BigInt(length(x)), colvalues))
     out_df = DataFrame()
     inner = 1
     for (val, cname) in zip(colvalues, colnames)
@@ -1563,4 +1563,3 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
 end
 
 _try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false)
-
diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -1,11 +1,3 @@
-export by, aggregate
-
-# TODO: remove definitions in 2.0 release
-by(args...; kwargs...) = throw(ArgumentError("by function was removed from DataFrames.jl. " *
-                                             "Use the `combine(groupby(...), ...)` or `combine(f, groupby(...))` instead."))
-aggregate(args...; kwargs...) = throw(ArgumentError("aggregate function was removed from DataFrames.jl. " *
-                                                    "Use the `combine` function instead."))
-
 # TODO: remove deprecation in 2.0 release
 import Base.delete!
 @deprecate delete!(df::DataFrame, inds) deleteat!(df::DataFrame, inds)
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
@@ -156,7 +156,7 @@ function refpool_and_array(x::AbstractArray)
         else
             minval, maxval = extrema(x)
         end
-        ngroups = big(maxval) - big(minval) + 1
+        ngroups = BigInt(maxval) - BigInt(minval) + 1
         # Threshold chosen with the same rationale as the row_group_slots! refpool method:
         # refpool approach is faster but we should not allocate too much memory either
         # We also have to avoid overflow, including with ngroups + 1 for missing values
@@ -337,7 +337,11 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
        nt = max(1, lg ÷ 100_000)
     end
     # if there are few rows per group limit the number of threads used
-    nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt)
+    if ngroups == 0
+        nt = 1
+    else
+        nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt)
+    end
 
     seen = fill(false, ngroups)
     seen_vec = Vector{Vector{Bool}}(undef, nt)

diff --git a/src/join/core.jl b/src/join/core.jl
@@ -328,7 +328,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}}
                                  right::AbstractVector{<:Union{Integer, Missing}})
     minv, maxv = extrema_missing(right)
 
-    val_range = big(maxv) - big(minv)
+    val_range = BigInt(maxv) - BigInt(minv)
     if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) ||
        minv < typemin(Int) + 2 || maxv > typemax(Int) - 3
        return _innerjoin_unsorted(left, right)
@@ -648,7 +648,7 @@ function _semijoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}},
                                 right_shorter::Bool)
     minv, maxv = extrema_missing(right)
 
-    val_range = big(maxv) - big(minv)
+    val_range = BigInt(maxv) - BigInt(minv)
     if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) ||
        minv < typemin(Int) + 2 || maxv > typemax(Int) - 3
        return _semijoin_unsorted(left, right, seen_rows, right_shorter)

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -1180,10 +1180,16 @@ end
     @test_throws BoundsError first(DataFrame(x=[]))
     @test_throws BoundsError last(DataFrame(x=[]))
 
-    @test first(df, 6) == DataFrame(A=1:6)
-    @test first(df, 1) == DataFrame(A=1)
-    @test last(df, 6) == DataFrame(A=5:10)
-    @test last(df, 1) == DataFrame(A=10)
+    for v in (true, false)
+        @test first(df, 6, view=v) == DataFrame(A=1:6)
+        @test first(df, 1, view=v) == DataFrame(A=1)
+        @test first(df, 0, view=v) == DataFrame(A=Int[])
+        @test_throws ArgumentError first(df, -1, view=v)
+        @test last(df, 6, view=v) == DataFrame(A=5:10)
+        @test last(df, 1, view=v) == DataFrame(A=10)
+        @test last(df, 0, view=v) == DataFrame(A=Int[])
+        @test_throws ArgumentError last(df, -1, view=v)
+    end
 
     @inferred first(df, 6)
     @inferred last(df, 6)
@@ -2325,6 +2331,7 @@ end
         @test allunique(df, [])
         @test allunique(df, x -> 1:4)
         @test allunique(df, [:a, :b] => ByRow(string))
+        @test_throws ArgumentError allunique(df, ())
     end
 end