Skip to content

Commit

Permalink
use dict to cache eltype names (#2750)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored May 7, 2021
1 parent 32b86d4 commit 818cb11
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 101 deletions.
95 changes: 92 additions & 3 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,91 @@
"""
    DataFrames.getmaxwidths(df::AbstractDataFrame,
                            io::IO,
                            rowindices1::AbstractVector{Int},
                            rowindices2::AbstractVector{Int},
                            rowlabel::Symbol,
                            rowid::Union{Integer, Nothing},
                            show_eltype::Bool,
                            buffer::IOBuffer,
                            truncstring::Int)

Calculate, for each column of an AbstractDataFrame, the maximum
string width used to render the name of that column, its type, and the
longest entry in that column -- among the rows of the data frame
that will be rendered to IO. The widths for all columns are returned as a
vector.

Return a `Vector{Int}` giving the maximum string widths required to render
each column, including that column's name and type.

NOTE: The last entry of the result vector is the string width of the
      implicit row ID column contained in every `AbstractDataFrame`.

# Arguments
- `df::AbstractDataFrame`: The data frame whose columns will be printed.
- `io::IO`: The `IO` to which `df` is to be printed
- `rowindices1::AbstractVector{Int}`: A set of indices of the first
  chunk of the AbstractDataFrame that would be rendered to IO.
- `rowindices2::AbstractVector{Int}`: A set of indices of the second
  chunk of the AbstractDataFrame that would be rendered to IO. Can
  be empty if the AbstractDataFrame would be printed without any
  ellipses.
- `rowlabel::Symbol`: The label that will be used when rendering the
  numeric ID's of each row. Typically, this will be set to "Row".
- `rowid`: Used to handle showing `DataFrameRow`. When it is not `nothing`,
  the number of digits of `rowid` is used for the row ID column instead of
  the number of digits of the largest index in `rowindices1`/`rowindices2`.
- `show_eltype`: Whether to print the column type
  under the column name in the heading.
- `buffer`: buffer passed around to avoid reallocations in `ourstrwidth`
- `truncstring`: value forwarded to `ourstrwidth` when measuring cell
  contents (and the `#undef` placeholder). Column names, element type names
  and the row label are always measured with `0`, i.e. they are not truncated.
"""
function getmaxwidths(df::AbstractDataFrame,
                      io::IO,
                      rowindices1::AbstractVector{Int},
                      rowindices2::AbstractVector{Int},
                      rowlabel::Symbol,
                      rowid::Union{Integer, Nothing},
                      show_eltype::Bool,
                      buffer::IOBuffer,
                      truncstring::Int)
    # one slot per column plus a trailing slot for the row ID column
    maxwidths = Vector{Int}(undef, size(df, 2) + 1)

    undefstrwidth = ourstrwidth(io, "#undef", buffer, truncstring)

    # Compute all element type names in one batch so that a type shared by
    # several columns is converted to a string only once.
    ct = show_eltype ? batch_compacttype(Any[eltype(c) for c in eachcol(df)]) : String[]
    for (col_idx, (name, col)) in enumerate(pairs(eachcol(df)))
        # (1) Consider length of column name
        # do not truncate column name
        maxwidth = ourstrwidth(io, name, buffer, 0)

        # (2) Consider length of longest entry in that column
        for indices in (rowindices1, rowindices2), i in indices
            if isassigned(col, i)
                maxwidth = max(maxwidth, ourstrwidth(io, col[i], buffer, truncstring))
            else
                maxwidth = max(maxwidth, undefstrwidth)
            end
        end

        # (3) Consider length of the element type name
        if show_eltype
            # do not truncate eltype name
            maxwidth = max(maxwidth, ourstrwidth(io, ct[col_idx], buffer, 0))
        end
        maxwidths[col_idx] = maxwidth
    end

    # The last slot holds the width of the row ID column.
    # do not truncate rowlabel
    if rowid isa Nothing
        rowmaxwidth1 = isempty(rowindices1) ? 0 : ndigits(maximum(rowindices1))
        rowmaxwidth2 = isempty(rowindices2) ? 0 : ndigits(maximum(rowindices2))
        maxwidths[end] = max(max(rowmaxwidth1, rowmaxwidth2),
                             ourstrwidth(io, rowlabel, buffer, 0))
    else
        maxwidths[end] = max(ndigits(rowid), ourstrwidth(io, rowlabel, buffer, 0))
    end

    return maxwidths
end

"""
show(io::IO, mime::MIME, df::AbstractDataFrame)
Expand Down Expand Up @@ -107,8 +195,9 @@ function _show(io::IO, ::MIME"text/html", df::AbstractDataFrame;
if eltypes
write(io, "<tr>")
write(io, "<th></th>")
ct = batch_compacttype(Any[eltype(df[!, idx]) for idx in 1:mxcol])
for j in 1:mxcol
s = html_escape(compacttype(eltype(df[!, j])))
s = html_escape(ct[j])
write(io, "<th>$s</th>")
end
write(io, "</tr>")
Expand Down Expand Up @@ -281,8 +370,8 @@ function _show(io::IO, ::MIME"text/latex", df::AbstractDataFrame;
write(io, "\t\\hline\n")
if eltypes
write(io, "\t& ")
header = join(map(c -> latex_escape(string(compacttype(c))),
eltype.(eachcol(df)[1:mxcol])), " & ")
ct = batch_compacttype(Any[eltype(df[!, idx]) for idx in 1:mxcol])
header = join(latex_escape.(ct), " & ")
write(io, header)
mxcol < size(df, 2) && write(io, " & ")
write(io, "\\\\\n")
Expand Down
135 changes: 37 additions & 98 deletions src/abstractdataframe/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,39 @@ if VERSION < v"1.5.0-DEV.261" || VERSION < v"1.5.0-DEV.266"
end
end

"""Return compact string representation of type T"""
function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
# For most data frames, especially wide, columns having the same element type
# occur multiple times. batch_compacttype ensures that we compute string
# representation of a specific column element type only once and then reuse it.

"""
    batch_compacttype(types::Vector{Any}, maxwidths::Vector{Int})

Return a vector whose `i`-th entry is `compacttype(types[i], maxwidths[i])`.

`types` and `maxwidths` must have the same length. Results are cached so that
`compacttype` is called only once for every distinct `(type, maxwidth)` pair.
"""
function batch_compacttype(types::Vector{Any}, maxwidths::Vector{Int})
    @assert length(types) == length(maxwidths)
    # The cache key must include `maxwidth`: the compact name of a type depends
    # on the width it is allowed to occupy, so keying by the type alone would
    # make a column reuse the string computed for an earlier column of the same
    # type but a different width (and the result would depend on column order).
    cache = Dict{Tuple{Any, Int}, String}()
    return map(types, maxwidths) do T, maxwidth
        get!(cache, (T, maxwidth)) do
            compacttype(T, maxwidth)
        end
    end
end

"""
    batch_compacttype(types::Vector{Any}, maxwidth::Int=8)

Return a vector whose `i`-th entry is `compacttype(types[i], maxwidth)`.

The same `maxwidth` is used for every entry, and `compacttype` is called only
once for every distinct type in `types`; repeated types reuse the stored string.
"""
function batch_compacttype(types::Vector{Any}, maxwidth::Int=8)
    seen = Dict{Type, String}()
    return [get!(() -> compacttype(T, maxwidth), seen, T) for T in types]
end

"""
    compacttype(T::Type, maxwidth::Int=8)

Return compact string representation of type `T`.

When displaying a data frame we do not want the string representation of a type
to be longer than `maxwidth`. This function implements the rules by which type
names are cropped if they are longer than `maxwidth`.
"""
function compacttype(T::Type, maxwidth::Int=8)
maxwidth = max(8, maxwidth)

T === Any && return "Any"
Expand All @@ -82,8 +113,6 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
T = nonmissingtype(T)
sT = string(T)
suffix = "?"
# ignore "?" for initial width counting but respect it for display
initial || (maxwidth -= 1)
textwidth(sT) maxwidth && return sT * suffix
else
suffix = ""
Expand Down Expand Up @@ -119,93 +148,6 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
return first(sT, stop) * "" * suffix
end

"""
    DataFrames.getmaxwidths(df::AbstractDataFrame,
                            io::IO,
                            rowindices1::AbstractVector{Int},
                            rowindices2::AbstractVector{Int},
                            rowlabel::Symbol,
                            rowid::Union{Integer, Nothing},
                            show_eltype::Bool,
                            buffer::IOBuffer,
                            truncstring::Int)

Calculate, for each column of an AbstractDataFrame, the maximum
string width used to render the name of that column, its type, and the
longest entry in that column -- among the rows of the data frame
that will be rendered to IO. The widths for all columns are returned as a
vector.

Return a `Vector{Int}` giving the maximum string widths required to render
each column, including that column's name and type.

NOTE: The last entry of the result vector is the string width of the
      implicit row ID column contained in every `AbstractDataFrame`.

# Arguments
- `df::AbstractDataFrame`: The data frame whose columns will be printed.
- `io::IO`: The `IO` to which `df` is to be printed
- `rowindices1::AbstractVector{Int}`: A set of indices of the first
  chunk of the AbstractDataFrame that would be rendered to IO.
- `rowindices2::AbstractVector{Int}`: A set of indices of the second
  chunk of the AbstractDataFrame that would be rendered to IO. Can
  be empty if the AbstractDataFrame would be printed without any
  ellipses.
- `rowlabel::Symbol`: The label that will be used when rendering the
  numeric ID's of each row. Typically, this will be set to "Row".
- `rowid`: Used to handle showing `DataFrameRow`. If it is an integer, its
  number of digits determines the width of the row ID column; if it is
  `nothing`, the largest index in `rowindices1`/`rowindices2` is used.
- `show_eltype`: Whether to print the column type
  under the column name in the heading.
- `buffer`: buffer passed around to avoid reallocations in `ourstrwidth`
- `truncstring`: value forwarded to `ourstrwidth` when measuring cell
  contents (and the `#undef` placeholder). Column names, element type names
  and the row label are measured with `0` instead, i.e. without truncation.
"""
function getmaxwidths(df::AbstractDataFrame,
                      io::IO,
                      rowindices1::AbstractVector{Int},
                      rowindices2::AbstractVector{Int},
                      rowlabel::Symbol,
                      rowid::Union{Integer, Nothing},
                      show_eltype::Bool,
                      buffer::IOBuffer,
                      truncstring::Int)
    # one slot per column plus a trailing slot for the row ID column
    maxwidths = Vector{Int}(undef, size(df, 2) + 1)

    undefstrwidth = ourstrwidth(io, "#undef", buffer, truncstring)

    for (j, (name, col)) in enumerate(pairs(eachcol(df)))
        # (1) Consider length of column name
        # do not truncate column name
        maxwidth = ourstrwidth(io, name, buffer, 0)

        # (2) Consider length of longest entry in that column
        for indices in (rowindices1, rowindices2), i in indices
            if isassigned(col, i)
                maxwidth = max(maxwidth, ourstrwidth(io, col[i], buffer, truncstring))
            else
                maxwidth = max(maxwidth, undefstrwidth)
            end
        end

        # (3) Consider length of the element type name
        if show_eltype
            # do not truncate eltype name
            maxwidth = max(maxwidth,
                           ourstrwidth(io, compacttype(eltype(col)), buffer, 0))
        end
        maxwidths[j] = maxwidth
    end

    # The last slot holds the width of the row ID column.
    # do not truncate rowlabel
    if rowid isa Nothing
        rowmaxwidth1 = isempty(rowindices1) ? 0 : ndigits(maximum(rowindices1))
        rowmaxwidth2 = isempty(rowindices2) ? 0 : ndigits(maximum(rowindices2))
        maxwidths[end] = max(max(rowmaxwidth1, rowmaxwidth2),
                             ourstrwidth(io, rowlabel, buffer, 0))
    else
        maxwidths[end] = max(ndigits(rowid), ourstrwidth(io, rowlabel, buffer, 0))
    end

    return maxwidths
end

function _show(io::IO,
df::AbstractDataFrame;
allrows::Bool = !get(io, :limit, false),
Expand All @@ -220,13 +162,10 @@ function _show(io::IO,
_check_consistency(df)

names_str = names(df)
names_len = textwidth.(names_str)
maxwidth = max.(9, names_len)
types = eltype.(eachcol(df))

# NOTE: If we reuse `types` here, the time to print the first table is 2x
# more. This should be something related to type inference.
types_str = compacttype.(eltype.(eachcol(df)), maxwidth)
names_len = Int[textwidth(n) for n in names_str]
maxwidth = Int[max(9, nl) for nl in names_len]
types = Any[eltype(c) for c in eachcol(df)]
types_str = batch_compacttype(types, maxwidth)

if allcols && allrows
crop = :none
Expand Down

2 comments on commit 818cb11

@bkamins
Copy link
Member Author

@bkamins bkamins commented on 818cb11 May 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/36256

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.1.1 -m "<description of version>" 818cb1171ccc50b964676c67b12c28ffc6d39626
git push origin v1.1.1

Please sign in to comment.