Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: add Histogram types #6601

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -865,8 +865,8 @@ export
# statistics
cor,
cov,
Histogram,
hist,
hist2d,
histrange,
mean!,
mean,
Expand Down
131 changes: 73 additions & 58 deletions base/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ end

## nice-valued ranges for histograms

function histrange{T<:FloatingPoint,N}(v::AbstractArray{T,N}, n::Integer)
function histrange{T<:FloatingPoint,N}(v::AbstractArray{T,N}, n::Integer, interval::Symbol)
if length(v) == 0
return 0.0:1.0:0.0
end
Expand All @@ -437,12 +437,17 @@ function histrange{T<:FloatingPoint,N}(v::AbstractArray{T,N}, n::Integer)
step = 10*e
end
end
start = step*(ceil(lo/step)-1)
nm1 = iceil((hi - start)/step)
if interval == :right
start = step*(ceil(lo/step)-1)
nm1 = iceil((hi - start)/step)
else
start = step*floor(lo/step)
nm1 = ifloor((hi - start)/step)+1
end
start:step:(start + nm1*step)
end

function histrange{T<:Integer,N}(v::AbstractArray{T,N}, n::Integer)
function histrange{T<:Integer,N}(v::AbstractArray{T,N}, n::Integer, interval::Symbol)
if length(v) == 0
return 0:1:0
end
Expand All @@ -451,7 +456,7 @@ function histrange{T<:Integer,N}(v::AbstractArray{T,N}, n::Integer)
step = 1
else
bw = (hi - lo) / n
e = 10^max(0,ifloor(log10(bw)))
e = int(10^max(0,ifloor(log10(bw))))
r = bw / e
if r <= 1
step = e
Expand All @@ -463,8 +468,13 @@ function histrange{T<:Integer,N}(v::AbstractArray{T,N}, n::Integer)
step = 10*e
end
end
start = step*(ceil(lo/step)-1)
nm1 = iceil((hi - start)/step)
if interval == :right
start = step*(iceil(lo/step)-1)
nm1 = iceil((hi - start)/step)
else
start = step*ifloor(lo/step)
nm1 = ifloor((hi - start)/step)+1
end
start:step:(start + nm1*step)
end

Expand All @@ -478,71 +488,76 @@ function sturges(n) # Sturges' formula
iceil(log2(n))+1
end

function hist!{HT}(h::AbstractArray{HT}, v::AbstractVector, edg::AbstractVector; init::Bool=true)
n = length(edg) - 1
length(h) == n || error("length(h) must equal length(edg) - 1.")
if init
fill!(h, zero(HT))
# N-dimensional histogram object
immutable Histogram{T<:Real,N,E}
edges::E
weights::Array{T,N}
interval::Symbol
function Histogram(edges::NTuple{N,AbstractArray},weights::Array{T,N},interval::Symbol)
interval == :right || interval == :left || error("interval must :left or :right")
map(x -> length(x)-1,edges) == size(weights) || error("Histogram edge vectors must be 1 longer than corresponding weight dimensions")
new(edges,weights,interval)
end
for x in v
i = searchsortedfirst(edg, x)-1
if 1 <= i <= n
h[i] += 1
end
end
edg, h
end
Histogram{T,N}(edges::NTuple{N,AbstractVector},weights::AbstractArray{T,N},interval::Symbol=:right) = Histogram{T,N,typeof(edges)}(edges,weights,interval)
Histogram{T,N}(edges::NTuple{N,AbstractVector},::Type{T},interval::Symbol=:right) = Histogram(edges,zeros(T,map(x -> length(x)-1,edges)...),interval)
Histogram{N}(edges::NTuple{N,AbstractVector},interval::Symbol=:right) = Histogram(edges,Int,interval)

hist(v::AbstractVector, edg::AbstractVector) = hist!(Array(Int, length(edg)-1), v, edg)
hist(v::AbstractVector, n::Integer) = hist(v,histrange(v,n))
hist(v::AbstractVector) = hist(v,sturges(length(v)))
isequal(h1::Histogram,h2::Histogram) = isequal(h1.edges,h2.edges) && isequal(h1.weights,h2.weights)

function hist!{HT}(H::AbstractArray{HT,2}, A::AbstractMatrix, edg::AbstractVector; init::Bool=true)
m, n = size(A)
size(H) == (length(edg)-1, n) || error("Incorrect size of H.")
if init
fill!(H, zero(HT))
# 1-dimensional
Histogram{T}(edge::AbstractVector,weights::AbstractVector{T},interval::Symbol=:right) = Histogram{T,1,(typeof(edge),)}((edge,),weights,interval)
Histogram{T}(edge::AbstractVector,::Type{T},interval::Symbol=:right) = Histogram(edge,zeros(T,length(edge)-1),interval)
Histogram(edge::AbstractVector,interval::Symbol=:right) = Histogram(edge,Int,interval)

function push!{T,E}(h::Histogram{T,1,E}, x::Real)
i = if h.interval == :right
searchsortedfirst(h.edges[1], x) - 1
else
searchsortedlast(h.edges[1], x)
end
for j = 1:n
hist!(sub(H, :, j), sub(A, :, j), edg)
if 1 <= i <= length(h.weights)
@inbounds h.weights[i] += one(T)
end
edg, H
h
end

hist(A::AbstractMatrix, edg::AbstractVector) = hist!(Array(Int, length(edg)-1, size(A,2)), A, edg)
hist(A::AbstractMatrix, n::Integer) = hist(A,histrange(A,n))
hist(A::AbstractMatrix) = hist(A,sturges(size(A,1)))
function append!{T}(h::Histogram{T,1}, v::AbstractVector)
for x in v
push!(h,x)
end
h
end

hist(v::AbstractVector, edg::AbstractVector; interval::Symbol=:right) = append!(Histogram(edg,interval),v)
hist(v::AbstractVector, n::Integer; interval::Symbol=:right) = hist(v,histrange(v,n,interval);interval=interval)
hist(v::AbstractVector; interval::Symbol=:right) = hist(v,sturges(length(v));interval=interval)

## hist2d
function hist2d!{HT}(H::AbstractArray{HT,2}, v::AbstractMatrix,
edg1::AbstractVector, edg2::AbstractVector; init::Bool=true)
size(v,2) == 2 || error("hist2d requires an Nx2 matrix.")
n = length(edg1) - 1
m = length(edg2) - 1
size(H) == (n, m) || error("Incorrect size of H.")
if init
fill!(H, zero(HT))
# N-dimensional
function push!{T,N}(h::Histogram{T,N},xs::NTuple{N,Real})
is = if h.interval == :right
map((edge, x) -> searchsortedfirst(edge,x) - 1, h.edges, xs)
else
map(searchsortedlast, h.edges, xs)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this is an argument for having searchsortedfirst return 1 less by default? cf. #5664

end
for i = 1:size(v,1)
x = searchsortedfirst(edg1, v[i,1]) - 1
y = searchsortedfirst(edg2, v[i,2]) - 1
if 1 <= x <= n && 1 <= y <= m
@inbounds H[x,y] += 1
end
try
h.weights[is...] += one(T)
catch e
isa(e,BoundsError) || rethrow(e)
end
edg1, edg2, H
h
end
function append!{T,N}(h::Histogram{T,N}, vs::NTuple{N,AbstractVector})
for xs in zip(vs...)
push!(h,xs)
end
h
end

hist2d(v::AbstractMatrix, edg1::AbstractVector, edg2::AbstractVector) =
hist2d!(Array(Int, length(edg1)-1, length(edg2)-1), v, edg1, edg2)

hist2d(v::AbstractMatrix, edg::AbstractVector) = hist2d(v, edg, edg)

hist2d(v::AbstractMatrix, n1::Integer, n2::Integer) =
hist2d(v, histrange(sub(v,:,1),n1), histrange(sub(v,:,2),n2))
hist2d(v::AbstractMatrix, n::Integer) = hist2d(v, n, n)
hist2d(v::AbstractMatrix) = hist2d(v, sturges(size(v,1)))
hist{N}(vs::NTuple{N,AbstractVector}, edges::NTuple{N,AbstractVector}; interval::Symbol=:right) = append!(Histogram(edges,interval),vs)
hist{N}(vs::NTuple{N,AbstractVector}, ns::NTuple{N,Integer}; interval::Symbol=:right) = hist(vs, map((v,n) -> histrange(v,n,interval),vs,ns);interval=interval)
hist{N}(vs::NTuple{N,AbstractVector}, n::Integer; interval::Symbol=:right) = hist(vs, map(v -> histrange(v,n,interval),vs);interval=interval)
hist{N}(vs::NTuple{N,AbstractVector}; interval::Symbol=:right) = hist(vs, sturges(length(vs[1]));interval=interval)


## quantiles ##
Expand Down
8 changes: 4 additions & 4 deletions test/arrayops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -723,13 +723,13 @@ B = cat(3, 1, 2, 3)
begin
local a,h,i
a = rand(5,5)
h = mapslices(v -> hist(v,0:0.1:1)[2], a, 1)
H = mapslices(v -> hist(v,0:0.1:1)[2], a, 2)
h = mapslices(v -> hist(v,0:0.1:1), a, 1)
H = mapslices(v -> hist(v,0:0.1:1), a, 2)
s = mapslices(sort, a, [1])
S = mapslices(sort, a, [2])
for i = 1:5
@test h[:,i] == hist(a[:,i],0:0.1:1)[2]
@test vec(H[i,:]) == hist(vec(a[i,:]),0:0.1:1)[2]
@test h[i] == hist(a[:,i],0:0.1:1)
@test H[i] == hist(vec(a[i,:]),0:0.1:1)
@test s[:,i] == sort(a[:,i])
@test vec(S[i,:]) == sort(vec(a[i,:]))
end
Expand Down
2 changes: 1 addition & 1 deletion test/parallel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ end # @unix_only(SharedArray tests)
if nprocs() < 4
remotecall_fetch(1, () -> addprocs(4 - nprocs()))
end
workloads = hist(@parallel((a,b)->[a,b], for i=1:7; myid(); end), nprocs())[2]
workloads = hist(@parallel((a,b)->[a,b], for i=1:7; myid(); end), nprocs()).weights
@test maximum(workloads) - minimum(workloads) <= 1

# @parallel reduction should work even with very short ranges
Expand Down
26 changes: 16 additions & 10 deletions test/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,16 +191,22 @@ end


# test hist

@test sum(hist([1,2,3])[2]) == 3
@test hist([])[2] == []
@test hist([1])[2] == [1]
@test hist([1,2,3],[0,2,4]) == ([0,2,4],[2,1])
@test hist([1,2,3],0:2:4) == (0:2:4,[2,1])
@test all(hist([1:100]/100,0.0:0.01:1.0)[2] .==1)
@test hist([1,1,1,1,1])[2][1] == 5
@test sum(hist2d(rand(100, 2))[3]) == 100
@test hist([1 2 3 4;1 2 3 4]) == (0.0:2.0:4.0, [2 2 0 0; 0 0 2 2])
@test sum(hist([1,2,3]).weights) == 3
@test hist([]).weights == []
@test hist([1]).weights == [1]
@test hist([1,2,3],[0,2,4]) == Histogram([0,2,4],[2,1])
@test hist([1,2,3],[0,2,4]) != Histogram([0,2,4],[1,1])
@test hist([1,2,3],0:2:4) == Histogram(0:2:4,[2,1])
@test all(hist([1:100]/100,0.0:0.01:1.0).weights .==1)
@test hist([1,1,1,1,1]).weights[1] == 5
@test sum(hist((rand(100),rand(100))).weights) == 100
@test hist(1:100,5;interval=:right).weights == [20,20,20,20,20]
@test hist(1:100,5;interval=:left).weights == [19,20,20,20,20,1]
@test hist(0:99,5;interval=:right).weights == [1,20,20,20,20,19]
@test hist(0:99,5;interval=:left).weights == [20,20,20,20,20]

#r = 0.0:2.0:4.0
#@test hist([1 2 3 4;1 2 3 4],r) == [Histogram(r,[2,0]),Histogram(r,[2,0]),Histogram(r,[0,2]),Histogram(r,[0,2])]

@test midpoints(1.0:1.0:10.0) == 1.5:1.0:9.5
@test midpoints(1:10) == 1.5:9.5
Expand Down