Skip to content

Commit

Permalink
move cor, cov, std, stdm, var, varm and linreg to StatsBase (#27152)
Browse files Browse the repository at this point in the history
  • Loading branch information
fredrikekre authored and ViralBShah committed May 28, 2018
1 parent e253cda commit 85dce1b
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 145 deletions.
24 changes: 0 additions & 24 deletions src/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1010,27 +1010,3 @@ end

chol(A::SparseMatrixCSC) = error("Use cholesky() instead of chol() for sparse matrices.")
eigen(A::SparseMatrixCSC) = error("Use IterativeEigensolvers.eigs() instead of eigen() for sparse matrices.")

function Base.cov(X::SparseMatrixCSC; dims::Int=1, corrected::Bool=true)
vardim = dims
a, b = size(X)
n, p = vardim == 1 ? (a, b) : (b, a)

# The covariance can be decomposed into two terms
# 1/(n - 1) ∑ (x_i - x̄)*(x_i - x̄)' = 1/(n - 1) (∑ x_i*x_i' - n*x̄*x̄')
# which can be evaluated via a sparse matrix-matrix product

# Compute ∑ x_i*x_i' = X'X using sparse matrix-matrix product
out = Matrix(Base.unscaled_covzm(X, vardim))

# Compute x̄
x̄ᵀ = mean(X, dims=vardim)

# Subtract n*x̄*x̄' from X'X
@inbounds for j in 1:p, i in 1:p
out[i,j] -= x̄ᵀ[i] * x̄ᵀ[j]' * n
end

# scale with the sample size n or the corrected sample size n - 1
return rmul!(out, inv(n - corrected))
end
61 changes: 0 additions & 61 deletions src/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3489,67 +3489,6 @@ function hash(A::SparseMatrixCSC{T}, h::UInt) where T
hashrun(0, length(A)-lastidx, h) # Hash zeros at end
end

## Statistics

# This is the function that does the reduction underlying var/std
function Base.centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti}
lsiz = Base.check_reducedims(R,A)
size(means) == size(R) || error("size of means must match size of R")
isempty(R) || fill!(R, zero(S))
isempty(A) && return R

colptr = A.colptr
rowval = A.rowval
nzval = A.nzval
m = size(A, 1)
n = size(A, 2)

if size(R, 1) == size(R, 2) == 1
# Reduction along both columns and rows
R[1, 1] = Base.centralize_sumabs2(A, means[1])
elseif size(R, 1) == 1
# Reduction along rows
@inbounds for col = 1:n
mu = means[col]
r = convert(S, (m-colptr[col+1]+colptr[col])*abs2(mu))
@simd for j = colptr[col]:colptr[col+1]-1
r += abs2(nzval[j] - mu)
end
R[1, col] = r
end
elseif size(R, 2) == 1
# Reduction along columns
rownz = fill(convert(Ti, n), m)
@inbounds for col = 1:n
@simd for j = colptr[col]:colptr[col+1]-1
row = rowval[j]
R[row, 1] += abs2(nzval[j] - means[row])
rownz[row] -= 1
end
end
for i = 1:m
R[i, 1] += rownz[i]*abs2(means[i])
end
else
# Reduction along a dimension > 2
@inbounds for col = 1:n
lastrow = 0
@simd for j = colptr[col]:colptr[col+1]-1
row = rowval[j]
for i = lastrow+1:row-1
R[i, col] = abs2(means[i, col])
end
R[row, col] = abs2(nzval[j] - means[row, col])
lastrow = row
end
for i = lastrow+1:m
R[i, col] = abs2(means[i, col])
end
end
end
return R
end

## Uniform matrix arithmetic

(+)(A::SparseMatrixCSC, J::UniformScaling) = A + sparse(J, size(A)...)
Expand Down
62 changes: 2 additions & 60 deletions test/sparse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ end
pA = sparse(rand(3, 7))

for arr in (se33, sA, pA)
for f in (sum, prod, minimum, maximum, var)
for f in (sum, prod, minimum, maximum)
farr = Array(arr)
@test f(arr) f(farr)
@test f(arr, dims=1) f(farr, dims=1)
Expand Down Expand Up @@ -518,9 +518,8 @@ end
@test prod(sparse(Int[])) === 1
@test_throws ArgumentError minimum(sparse(Int[]))
@test_throws ArgumentError maximum(sparse(Int[]))
@test var(sparse(Int[])) === NaN

for f in (sum, prod, var)
for f in (sum, prod)
@test isequal(f(spzeros(0, 1), dims=1), f(Matrix{Int}(I, 0, 1), dims=1))
@test isequal(f(spzeros(0, 1), dims=2), f(Matrix{Int}(I, 0, 1), dims=2))
@test isequal(f(spzeros(0, 1), dims=(1, 2)), f(Matrix{Int}(I, 0, 1), dims=(1, 2)))
Expand Down Expand Up @@ -2033,63 +2032,6 @@ end
@test issymmetric(B)
end

# Faster covariance function for sparse matrices
# Prevents densifying the input matrix when subtracting the mean
# Test against dense implementation
# PR https://github.com/JuliaLang/julia/pull/22735
# Part of this test needed to be hacked due to the treatment
# of Inf in sparse matrix algebra
# https://github.com/JuliaLang/julia/issues/22921
# The issue will be resolved in
# https://github.com/JuliaLang/julia/issues/22733
@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64})
n = 10
p = 5
np2 = div(n*p, 2)
nzvals, x_sparse = guardsrand(1) do
if elty <: Real
nzvals = randn(np2)
else
nzvals = complex.(randn(np2), randn(np2))
end
nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p)
end
x_dense = convert(Matrix{elty}, x_sparse)
@testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2),
corrected in (true, false)
@test cov(x_sparse, dims=vardim, corrected=corrected)
cov(x_dense , dims=vardim, corrected=corrected)
end

@testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf),
vardim in (1, 2),
corrected in (true, false)
x_sparse[1,1] = x11
x_dense[1 ,1] = x11

cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected)
cov_dense = cov(x_dense , dims=vardim, corrected=corrected)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isfinite.(cov_sparse) == isfinite.(cov_dense)
@test isfinite.(cov_sparse) == isfinite.(cov_dense)
end

@testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2),
corrected in (true, false)
x_sparse[1,1] = Inf
x_dense[1 ,1] = Inf
x_sparse[2,1] = NaN
x_dense[2 ,1] = NaN

cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected)
cov_dense = cov(x_dense , dims=vardim, corrected=corrected)
@test cov_sparse[(1 + vardim):end, (1 + vardim):end]
cov_dense[ (1 + vardim):end, (1 + vardim):end]
@test isfinite.(cov_sparse) == isfinite.(cov_dense)
@test isfinite.(cov_sparse) == isfinite.(cov_dense)
end
end

@testset "similar should not alias the input sparse array" begin
a = sparse(rand(3,3) .+ 0.1)
b = similar(a, Float32, Int32)
Expand Down

0 comments on commit 85dce1b

Please sign in to comment.