Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve handling of NaN in ranks and rank correlations #659

Merged
merged 7 commits into from
Mar 7, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 94 additions & 7 deletions src/rankcorr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,101 @@ Compute Spearman's rank correlation coefficient. If `x` and `y` are vectors, the
output is a float, otherwise it's a matrix corresponding to the pairwise correlations
of the columns of `x` and `y`.
"""
corspearman(x::RealVector, y::RealVector) = cor(tiedrank(x), tiedrank(y))
function corspearman(x::RealVector, y::RealVector)
n = length(x)
n == length(y) || throw(DimensionMismatch("vectors must have same length"))
(any(isnan, x) || any(isnan, y)) && return NaN
return cor(tiedrank(x), tiedrank(y))
end

function corspearman(X::RealMatrix, y::RealVector)
size(X, 1) == length(y) ||
throw(DimensionMismatch("X and y have inconsistent dimensions"))
n = size(X, 2)
C = Matrix{Float64}(I, n, 1)
any(isnan, y) && return fill!(C, NaN)
yrank = tiedrank(y)
for j = 1:n
Xj = view(X, :, j)
if any(isnan, Xj)
C[j,1] = NaN
else
Xjrank = tiedrank(Xj)
C[j,1] = cor(Xjrank, yrank)
end
end
return C
end

function corspearman(x::RealVector, Y::RealMatrix)
size(Y, 1) == length(x) ||
throw(DimensionMismatch("x and Y have inconsistent dimensions"))
n = size(Y, 2)
C = Matrix{Float64}(I, 1, n)
any(isnan, x) && return fill!(C, NaN)
xrank = tiedrank(x)
for j = 1:n
Yj = view(Y, :, j)
if any(isnan, Yj)
C[1,j] = NaN
else
Yjrank = tiedrank(Yj)
C[1,j] = cor(xrank, Yjrank)
end
end
return C
end

corspearman(X::RealMatrix, Y::RealMatrix) =
cor(mapslices(tiedrank, X, dims=1), mapslices(tiedrank, Y, dims=1))
corspearman(X::RealMatrix, y::RealVector) = cor(mapslices(tiedrank, X, dims=1), tiedrank(y))
corspearman(x::RealVector, Y::RealMatrix) = cor(tiedrank(x), mapslices(tiedrank, Y, dims=1))
function corspearman(X::RealMatrix)
n = size(X, 2)
C = Matrix{Float64}(I, n, n)
for j = 1:n
Xj = view(X, :, j)
if any(isnan, Xj)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
C[:,j] .= NaN
C[j,:] .= NaN
C[j,j] = 1
continue
end
Xjrank = tiedrank(Xj)
for i = 1:(j-1)
Xi = view(X, :, i)
if any(isnan, Xi)
C[i,j] = C[j,i] = NaN
else
Xirank = tiedrank(Xi)
C[i,j] = C[j,i] = cor(Xjrank, Xirank)
end
end
end
return C
end

corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z))
function corspearman(X::RealMatrix, Y::RealMatrix)
size(X, 1) == size(Y, 1) ||
throw(ArgumentError("number of columns in each array must match"))
nr = size(X, 2)
nc = size(Y, 2)
C = Matrix{Float64}(undef, nr, nc)
for j = 1:nr
Xj = view(X, :, j)
if any(isnan, Xj)
C[j,:] .= NaN
continue
end
Xjrank = tiedrank(Xj)
for i = 1:nc
Yi = view(Y, :, i)
if any(isnan, Yi)
C[j,i] = NaN
else
Yirank = tiedrank(Yi)
C[j,i] = cor(Xjrank, Yirank)
end
end
end
return C
end


#######################################
Expand All @@ -44,7 +131,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ
# Initial sorting
permute!(x, permx)
permute!(y, permx)

# Use widen to avoid overflows on both 32bit and 64bit
npairs = div(widen(n) * (n - 1), 2)
ntiesx = ndoubleties = nswaps = widen(0)
Expand Down
55 changes: 54 additions & 1 deletion test/rankcorr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ using StatsBase
using Test

X = Float64[1 0; 2 1; 3 0; 4 1; 5 10]
Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10]

x1 = X[:,1]
x2 = X[:,2]
y = [5, 3, 4, 2, 5]
y = Y[:,1]

# corspearman

Expand All @@ -23,6 +24,9 @@ c22 = corspearman(x2, x2)
@test corspearman(X, X) ≈ [c11 c12; c12 c22]
@test corspearman(X) ≈ [c11 c12; c12 c22]

@test corspearman(X, Y) ==
[corspearman(X[:,i], Y[:,j]) for i in axes(X, 2), j in axes(Y, 2)]

# corkendall

# Check error, handling of NaN, Inf etc
Expand Down Expand Up @@ -106,3 +110,52 @@ w = repeat(z, n)

StatsBase.midpoint(1,10) == 5
StatsBase.midpoint(1,widen(10)) == 5


# NaN handling

Xnan = copy(X)
Xnan[1,1] = NaN
Ynan = copy(Y)
Ynan[2,1] = NaN

for f in (corspearman, corkendall)
@test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4]))
@test all(isnan, f([1.0, NaN], [1 2; 3 4]))
@test all(isnan, f([1 2; 3 4], [1.0, NaN]))
@test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1])
@test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4]))
@test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4]))

@test isequal(f(Xnan, Ynan),
[f(Xnan[:,i], Ynan[:,j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)])
@test isequal(f(Xnan),
[i == j ? 1.0 : f(Xnan[:,i], Xnan[:,j])
for i in axes(Xnan, 2), j in axes(Xnan, 2)])
for k in 1:2
@test isequal(f(Xnan[:,k], Ynan),
[f(Xnan[:,k], Ynan[:,j]) for i in 1:1, j in axes(Ynan, 2)])
# TODO: fix corkendall (PR#659)
if f === corspearman
@test isequal(f(Xnan, Ynan[:,k]),
[f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2), j in 1:1])
else
@test isequal(f(Xnan, Ynan[:,k]),
[f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2)])
end
end
end


# Wrong dimensions

@test_throws DimensionMismatch corspearman([1], [1, 2])
@test_throws DimensionMismatch corspearman([1], [1 2; 3 4])
@test_throws DimensionMismatch corspearman([1 2; 3 4], [1])
@test_throws ArgumentError corspearman([1 2; 3 4: 4 6], [1 2; 3 4])

# TODO: fix corkendall to match corspearman (PR#659)
@test_throws ErrorException corkendall([1], [1, 2])
@test_throws ErrorException corkendall([1], [1 2; 3 4])
@test_throws ErrorException corkendall([1 2; 3 4], [1])
@test_throws ArgumentError corkendall([1 2; 3 4: 4 6], [1 2; 3 4])