From 7073d71458e8d88c01908c951a17361c395e0985 Mon Sep 17 00:00:00 2001
From: Klaus Crusius <klaus.crusius@web.de>
Date: Mon, 19 Nov 2018 14:06:37 +0100
Subject: [PATCH 1/5] added sprandn methods with Type

---
 stdlib/SparseArrays/src/sparsematrix.jl  | 4 +++-
 stdlib/SparseArrays/src/sparsevector.jl  | 2 ++
 stdlib/SparseArrays/test/sparse.jl       | 7 +++++++
 stdlib/SparseArrays/test/sparsevector.jl | 5 +++++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/stdlib/SparseArrays/src/sparsematrix.jl b/stdlib/SparseArrays/src/sparsematrix.jl
index 02c975168c6ec..7fac079ac97d6 100644
--- a/stdlib/SparseArrays/src/sparsematrix.jl
+++ b/stdlib/SparseArrays/src/sparsematrix.jl
@@ -1471,7 +1471,7 @@ sprand(r::AbstractRNG, ::Type{Bool}, m::Integer, n::Integer, density::AbstractFl
 sprand(::Type{T}, m::Integer, n::Integer, density::AbstractFloat) where {T} = sprand(GLOBAL_RNG, T, m, n, density)
 
 """
-    sprandn([rng], m[,n],p::AbstractFloat)
+    sprandn([rng],[,Type],m[,n],p::AbstractFloat)
 
 Create a random sparse vector of length `m` or sparse matrix of size `m` by `n`
 with the specified (independent) probability `p` of any entry being nonzero,
@@ -1488,6 +1488,8 @@ julia> sprandn(2, 2, 0.75)
 """
 sprandn(r::AbstractRNG, m::Integer, n::Integer, density::AbstractFloat) = sprand(r,m,n,density,randn,Float64)
 sprandn(m::Integer, n::Integer, density::AbstractFloat) = sprandn(GLOBAL_RNG,m,n,density)
+sprandn(r::AbstractRNG, ::Type{T}, m::Integer, n::Integer, density::AbstractFloat) where T = sprand(r,m,n,density,(r,i) -> randn(r,T,i), T)
+sprandn(::Type{T}, m::Integer, n::Integer, density::AbstractFloat) where T = sprandn(GLOBAL_RNG,T,m,n,density)
 
 LinearAlgebra.fillstored!(S::SparseMatrixCSC, x) = (fill!(nzvalview(S), x); S)
 
diff --git a/stdlib/SparseArrays/src/sparsevector.jl b/stdlib/SparseArrays/src/sparsevector.jl
index e7f0e1a2db26a..ce9fbbc35e947 100644
--- a/stdlib/SparseArrays/src/sparsevector.jl
+++ b/stdlib/SparseArrays/src/sparsevector.jl
@@ -506,6 +506,8 @@ sprand(::Type{T}, n::Integer, p::AbstractFloat) where {T} = sprand(GLOBAL_RNG, T
 
 sprandn(n::Integer, p::AbstractFloat) = sprand(GLOBAL_RNG, n, p, randn)
 sprandn(r::AbstractRNG, n::Integer, p::AbstractFloat) = sprand(r, n, p, randn)
+sprandn(::Type{T}, n::Integer, p::AbstractFloat) where T = sprand(GLOBAL_RNG, n, p, (r, i) -> randn(r, T, i))
+sprandn(r::AbstractRNG, ::Type{T}, n::Integer, p::AbstractFloat) where T = sprand(r, n, p, (r, i) -> randn(r, T, i))
 
 ## Indexing into Matrices can return SparseVectors
 
diff --git a/stdlib/SparseArrays/test/sparse.jl b/stdlib/SparseArrays/test/sparse.jl
index c8e5ebd8b5acb..9eb2069d3984c 100644
--- a/stdlib/SparseArrays/test/sparse.jl
+++ b/stdlib/SparseArrays/test/sparse.jl
@@ -2335,4 +2335,11 @@ end
     @test m2.module == SparseArrays
 end
 
+@testset "sprandn with type $T" for T in (Float64, Float32, Float16, ComplexF64, ComplexF32, ComplexF16)
+    @test sprandn(T, 5, 5, 0.5) isa AbstractSparseMatrix{T}
+end
+@testset "sprandn with invalid type $T" for T in (AbstractFloat, BigFloat, Complex)
+    @test_throws MethodError sprandn(T, 5, 5, 0.5)
+end
+
 end # module
diff --git a/stdlib/SparseArrays/test/sparsevector.jl b/stdlib/SparseArrays/test/sparsevector.jl
index 6061b21a6721c..d8ada3963e2c5 100644
--- a/stdlib/SparseArrays/test/sparsevector.jl
+++ b/stdlib/SparseArrays/test/sparsevector.jl
@@ -154,6 +154,11 @@ end
             end
         end
 
+        let xr = sprandn(ComplexF64, 1000, 0.9)
+            @test isa(xr, SparseVector{ComplexF64,Int})
+            @test length(xr) == 1000
+        end
+
         let xr = sprand(Bool, 1000, 0.9)
             @test isa(xr, SparseVector{Bool,Int})
             @test length(xr) == 1000

From 2bad969cfeba6c336594127cd9e8af74e66f829e Mon Sep 17 00:00:00 2001
From: Klaus Crusius <klaus.crusius@web.de>
Date: Wed, 12 Dec 2018 19:27:12 +0100
Subject: [PATCH 2/5] improved speed and space for spmatmul

---
 stdlib/SparseArrays/src/linalg.jl  | 84 ++++++++++++++++++------------
 stdlib/SparseArrays/test/sparse.jl |  3 +-
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/stdlib/SparseArrays/src/linalg.jl b/stdlib/SparseArrays/src/linalg.jl
index 4350a2d922fc9..be8a8a90bb0c1 100644
--- a/stdlib/SparseArrays/src/linalg.jl
+++ b/stdlib/SparseArrays/src/linalg.jl
@@ -147,61 +147,79 @@ end
 *(A::Adjoint{<:Any,<:SparseMatrixCSC{Tv,Ti}}, B::Adjoint{<:Any,<:SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti} = spmatmul(copy(A), copy(B))
 *(A::Transpose{<:Any,<:SparseMatrixCSC{Tv,Ti}}, B::Transpose{<:Any,<:SparseMatrixCSC{Tv,Ti}}) where {Tv,Ti} = spmatmul(copy(A), copy(B))
 
-function spmatmul(A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti};
-                  sortindices::Symbol = :sortcols) where {Tv,Ti}
+# Gustavsen's matrix multiplication algorithm revisited.
+# The result rowval vector is already sorted by construction.
+# The auxiliary Vector{Ti} xb is replaced by a BitArray of same length.
+# Besides SparseMatrixCSC also SparseVector is accepted as B.
+# The optional argument controlling a sorting algorithm is obsolete.
+function spmatmul(A::SparseMatrixCSC{Tv,Ti},
+                  B::Union{<:SparseMatrixCSC{Tv,Ti},<:SparseVector{Tv,Ti}}) where {Tv,Ti,N}
+
     mA, nA = size(A)
-    mB, nB = size(B)
-    nA==mB || throw(DimensionMismatch())
-
-    colptrA = A.colptr; rowvalA = A.rowval; nzvalA = A.nzval
-    colptrB = B.colptr; rowvalB = B.rowval; nzvalB = B.nzval
-    # TODO: Need better estimation of result space
-    nnzC = min(mA*nB, length(nzvalA) + length(nzvalB))
-    colptrC = Vector{Ti}(undef, nB+1)
+    nB = size(B, 2)
+    nA == size(B, 1) || throw(DimensionMismatch())
+
+    rowvalA = rowvals(A); nzvalA = nonzeros(A)
+    rowvalB = rowvals(B); nzvalB = nonzeros(B)
+    nnzC = estimate_mulsize(mA, nnz(A), nA, nnz(B), nB)
+    if B isa SparseMatrixCSC; colptrC = Vector{Ti}(undef, nB+1) end
     rowvalC = Vector{Ti}(undef, nnzC)
     nzvalC = Vector{Tv}(undef, nnzC)
 
     @inbounds begin
         ip = 1
-        xb = zeros(Ti, mA)
-        x  = zeros(Tv, mA)
+        x  = Vector{Tv}(undef, mA)
+        xb = BitArray(undef, mA)
         for i in 1:nB
+            fill!(xb, false)
             if ip + mA - 1 > nnzC
-                resize!(rowvalC, nnzC + max(nnzC,mA))
-                resize!(nzvalC, nnzC + max(nnzC,mA))
-                nnzC = length(nzvalC)
+                nnzC += max(mA, nnzC>>2)
+                resize!(rowvalC, nnzC)
+                resize!(nzvalC, nnzC)
             end
-            colptrC[i] = ip
-            for jp in colptrB[i]:(colptrB[i+1] - 1)
+            if B isa SparseMatrixCSC; colptrC[i] = ip end
+            for jp in nzrange(B, i)
                 nzB = nzvalB[jp]
                 j = rowvalB[jp]
-                for kp in colptrA[j]:(colptrA[j+1] - 1)
+                for kp in nzrange(A, j)
                     nzC = nzvalA[kp] * nzB
                     k = rowvalA[kp]
-                    if xb[k] != i
-                        rowvalC[ip] = k
-                        ip += 1
-                        xb[k] = i
-                        x[k] = nzC
-                    else
+                    if xb[k]
                         x[k] += nzC
+                    else
+                        x[k] = nzC
+                        xb[k] = true
                     end
                 end
             end
-            for vp in colptrC[i]:(ip - 1)
-                nzvalC[vp] = x[rowvalC[vp]]
+            for k in findall(xb)
+                nzvalC[ip] = x[k]
+                rowvalC[ip] = k
+                ip += 1
             end
         end
-        colptrC[nB+1] = ip
+        if B isa SparseMatrixCSC; colptrC[nB+1] = ip end
     end
 
-    deleteat!(rowvalC, colptrC[end]:length(rowvalC))
-    deleteat!(nzvalC, colptrC[end]:length(nzvalC))
+    ip -= 1
+    resize!(rowvalC, ip)
+    resize!(nzvalC, ip)
+
+    # This modification of Gustavson's algorithm has sorted row indices.
+    # The not needed sort makes a major performance improvement and saves space.
+    if B isa SparseMatrixCSC
+        SparseMatrixCSC(mA, nB, colptrC, rowvalC, nzvalC)
+    else
+        SparseVector(mA, rowvalC, nzvalC)
+    end
+end
 
-    # The Gustavson algorithm does not guarantee the product to have sorted row indices.
-    Cunsorted = SparseMatrixCSC(mA, nB, colptrC, rowvalC, nzvalC)
-    C = SparseArrays.sortSparseMatrixCSC!(Cunsorted, sortindices=sortindices)
-    return C
+# For randomly distributed nonzeros of the factors this is a rather good estimation
+# for the number of nonzeros of the product.
+# For heavily structured matrices the value tends to over-estimation.
+function estimate_mulsize(m::Integer, nnzA::Integer, n::Integer, nnzB::Integer, k::Integer)
+    p = (nnzA / (m * n)) * (nnzB / (n * k))
+    isnan(p) ? 0 : Int(ceil(-expm1(log1p(-p) * n) * m * k)) # is (1 - (1 - p)^n) * m * k
 end
 
 # Frobenius dot/inner product: trace(A'B)
diff --git a/stdlib/SparseArrays/test/sparse.jl b/stdlib/SparseArrays/test/sparse.jl
index 4328d59592ce9..040fde198a62e 100644
--- a/stdlib/SparseArrays/test/sparse.jl
+++ b/stdlib/SparseArrays/test/sparse.jl
@@ -318,8 +318,7 @@ end
         a = sprand(10, 5, 0.7)
         b = sprand(5, 15, 0.3)
         @test maximum(abs.(a*b - Array(a)*Array(b))) < 100*eps()
-        @test maximum(abs.(SparseArrays.spmatmul(a,b,sortindices=:sortcols) - Array(a)*Array(b))) < 100*eps()
-        @test maximum(abs.(SparseArrays.spmatmul(a,b,sortindices=:doubletranspose) - Array(a)*Array(b))) < 100*eps()
+        @test maximum(abs.(SparseArrays.spmatmul(a,b) - Array(a)*Array(b))) < 100*eps()
         f = Diagonal(rand(5))
         @test Array(a*f) == Array(a)*f
         @test Array(f*b) == f*Array(b)

From 7a49ff0b5624f6c5ba6932cf397c34f17a3feb82 Mon Sep 17 00:00:00 2001
From: Klaus Crusius <klaus.crusius@web.de>
Date: Wed, 12 Dec 2018 21:50:38 +0100
Subject: [PATCH 3/5] reduced number of allocations

---
 stdlib/SparseArrays/src/linalg.jl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/stdlib/SparseArrays/src/linalg.jl b/stdlib/SparseArrays/src/linalg.jl
index be8a8a90bb0c1..3f4c0c1409d5d 100644
--- a/stdlib/SparseArrays/src/linalg.jl
+++ b/stdlib/SparseArrays/src/linalg.jl
@@ -149,7 +149,7 @@ end
 
 # Gustavsen's matrix multiplication algorithm revisited.
 # The result rowval vector is already sorted by construction.
-# The auxiliary Vector{Ti} xb is replaced by a BitArray of same length.
+# The auxiliary Vector{Ti} xb is replaced by a Vector{Bool} of same length.
 # Besides SparseMatrixCSC also SparseVector is accepted as B.
 # The optional argument controlling a sorting algorithm is obsolete.
 function spmatmul(A::SparseMatrixCSC{Tv,Ti},
@@ -169,7 +169,7 @@ function spmatmul(A::SparseMatrixCSC{Tv,Ti},
     @inbounds begin
         ip = 1
         x  = Vector{Tv}(undef, mA)
-        xb = BitArray(undef, mA)
+        xb = Vector{Bool}(undef, mA)
         for i in 1:nB
             fill!(xb, false)
             if ip + mA - 1 > nnzC
@@ -192,10 +192,12 @@ function spmatmul(A::SparseMatrixCSC{Tv,Ti},
                     end
                 end
             end
-            for k in findall(xb)
-                nzvalC[ip] = x[k]
-                rowvalC[ip] = k
-                ip += 1
+            for k in 1:mA
+                if xb[k]
+                    nzvalC[ip] = x[k]
+                    rowvalC[ip] = k
+                    ip += 1
+                end
             end
         end
         if B isa SparseMatrixCSC; colptrC[nB+1] = ip end

From 526b33fefe8f4e94b41bd951907cbdff8b15a9f8 Mon Sep 17 00:00:00 2001
From: Klaus Crusius <klaus.crusius@web.de>
Date: Wed, 12 Dec 2018 22:11:55 +0100
Subject: [PATCH 4/5] increase initial allocation size

---
 stdlib/SparseArrays/src/linalg.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stdlib/SparseArrays/src/linalg.jl b/stdlib/SparseArrays/src/linalg.jl
index 3f4c0c1409d5d..2c0b381ba0455 100644
--- a/stdlib/SparseArrays/src/linalg.jl
+++ b/stdlib/SparseArrays/src/linalg.jl
@@ -161,7 +161,7 @@ function spmatmul(A::SparseMatrixCSC{Tv,Ti},
 
     rowvalA = rowvals(A); nzvalA = nonzeros(A)
     rowvalB = rowvals(B); nzvalB = nonzeros(B)
-    nnzC = estimate_mulsize(mA, nnz(A), nA, nnz(B), nB)
+    nnzC = estimate_mulsize(mA, nnz(A), nA, nnz(B), nB) * 11 ÷ 10 # 10% more
     if B isa SparseMatrixCSC; colptrC = Vector{Ti}(undef, nB+1) end
     rowvalC = Vector{Ti}(undef, nnzC)
     nzvalC = Vector{Tv}(undef, nnzC)

From 855959124dbafadafdcd324771ec06831bc79a1a Mon Sep 17 00:00:00 2001
From: Klaus Crusius <klaus.crusius@web.de>
Date: Sun, 16 Dec 2018 18:52:31 +0100
Subject: [PATCH 5/5] general performance improvements of spmatmul

---
 stdlib/SparseArrays/src/linalg.jl | 85 +++++++++++++++++++------------
 1 file changed, 53 insertions(+), 32 deletions(-)

diff --git a/stdlib/SparseArrays/src/linalg.jl b/stdlib/SparseArrays/src/linalg.jl
index 2c0b381ba0455..d8446d705d465 100644
--- a/stdlib/SparseArrays/src/linalg.jl
+++ b/stdlib/SparseArrays/src/linalg.jl
@@ -150,34 +150,35 @@ end
 # Gustavsen's matrix multiplication algorithm revisited.
 # The result rowval vector is already sorted by construction.
 # The auxiliary Vector{Ti} xb is replaced by a Vector{Bool} of same length.
-# Besides SparseMatrixCSC also SparseVector is accepted as B.
 # The optional argument controlling a sorting algorithm is obsolete.
-function spmatmul(A::SparseMatrixCSC{Tv,Ti},
-                  B::Union{<:SparseMatrixCSC{Tv,Ti},<:SparseVector{Tv,Ti}}) where {Tv,Ti,N}
-
+# depending on expected execution speed the sorting of the result column is
+# done by a quicksort of the row indices or by a full scan of the dense result vector.
+# The last is faster, if more than ≈ 1/32 of the result column is nonzero.
+# TODO: extend to SparseMatrixCSCUnion to allow for SubArrays (view(X, :, r)).
+function spmatmul(A::SparseMatrixCSC{Tv,Ti}, B::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
     mA, nA = size(A)
     nB = size(B, 2)
     nA == size(B, 1) || throw(DimensionMismatch())
 
     rowvalA = rowvals(A); nzvalA = nonzeros(A)
     rowvalB = rowvals(B); nzvalB = nonzeros(B)
-    nnzC = estimate_mulsize(mA, nnz(A), nA, nnz(B), nB) * 11 ÷ 10 # 10% more
-    if B isa SparseMatrixCSC; colptrC = Vector{Ti}(undef, nB+1) end
+    nnzC = max(estimate_mulsize(mA, nnz(A), nA, nnz(B), nB) * 11 ÷ 10, mA)
+    colptrC = Vector{Ti}(undef, nB+1)
     rowvalC = Vector{Ti}(undef, nnzC)
     nzvalC = Vector{Tv}(undef, nnzC)
+    nzpercol = nnzC ÷ max(nB, 1)
 
     @inbounds begin
         ip = 1
-        x  = Vector{Tv}(undef, mA)
-        xb = Vector{Bool}(undef, mA)
+        xb = fill(false, mA)
         for i in 1:nB
-            fill!(xb, false)
             if ip + mA - 1 > nnzC
                 nnzC += max(mA, nnzC>>2)
                 resize!(rowvalC, nnzC)
                 resize!(nzvalC, nnzC)
             end
-            if B isa SparseMatrixCSC; colptrC[i] = ip end
+            colptrC[i] = ip0 = ip
+            k0 = ip - 1
             for jp in nzrange(B, i)
                 nzB = nzvalB[jp]
                 j = rowvalB[jp]
@@ -185,45 +186,65 @@ function spmatmul(A::SparseMatrixCSC{Tv,Ti},
                     nzC = nzvalA[kp] * nzB
                     k = rowvalA[kp]
                     if xb[k]
-                        x[k] += nzC
+                        nzvalC[k+k0] += nzC
                     else
-                        x[k] = nzC
+                        nzvalC[k+k0] = nzC
                         xb[k] = true
+                        rowvalC[ip] = k
+                        ip += 1
                     end
                 end
             end
-            for k in 1:mA
-                if xb[k]
-                    nzvalC[ip] = x[k]
-                    rowvalC[ip] = k
-                    ip += 1
+            if ip > ip0
+                if prefer_sort(ip-k0, mA)
+                    # in-place sort of indices. Effort: O(nnz*ln(nnz)).
+                    sort!(rowvalC, ip0, ip-1, QuickSort, Base.Order.Forward)
+                    for vp = ip0:ip-1
+                        k = rowvalC[vp]
+                        xb[k] = false
+                        nzvalC[vp] = nzvalC[k+k0]
+                    end
+                else
+                    # scan result vector (effort O(mA))
+                    for k = 1:mA
+                        if xb[k]
+                            xb[k] = false
+                            rowvalC[ip0] = k
+                            nzvalC[ip0] = nzvalC[k+k0]
+                            ip0 += 1
+                        end
+                    end
                 end
             end
         end
-        if B isa SparseMatrixCSC; colptrC[nB+1] = ip end
+        colptrC[nB+1] = ip
     end
 
-    ip -= 1
-    resize!(rowvalC, ip)
-    resize!(nzvalC, ip)
+    resize!(rowvalC, ip - 1)
+    resize!(nzvalC, ip - 1)
 
-    # This modification of Gustavson's algorithm has sorted row indices.
-    # The not needed sort makes a major performance improvement and saves space.
-    if B isa SparseMatrixCSC
-        SparseMatrixCSC(mA, nB, colptrC, rowvalC, nzvalC)
-    else
-        SparseVector(mA, rowvalC, nzvalC)
-    end
+    # This modification of Gustavson algorithm has sorted row indices
+    C = SparseMatrixCSC(mA, nB, colptrC, rowvalC, nzvalC)
+    return C
 end
 
-# For randomly distributed nonzeros of the factors this is a rather good estimation
-# for the number of nonzeros of the product.
-# For heavily structured matrices the value tends to over-estimation.
+# estimated number of non-zeros in matrix product
+# it is assumed, that the non-zero indices are distributed independently and uniformly
+# in both matrices. Over-estimation is possible if that is not the case.
 function estimate_mulsize(m::Integer, nnzA::Integer, n::Integer, nnzB::Integer, k::Integer)
     p = (nnzA / (m * n)) * (nnzB / (n * k))
-    isnan(p) ? 0 : Int(ceil(-expm1(log1p(-p) * n) * m * k)) # is (1 - (1 - p)^n) * m * k
+    p >= 1 ? m*k : p > 0 ? Int(ceil(-expm1(log1p(-p) * n)*m*k)) : 0 # (1-(1-p)^n)*m*k
 end
 
+# determine if sort! shall be used or the whole column be scanned
+# based on empirical data on i7-3610QM CPU
+# measuring runtimes of the scanning and sorting loops of the algorithm.
+# The parameters 6 and 3 might be modified for different architectures.
+prefer_sort(nz::Integer, m::Integer) = m > 6 && 3 * ilog2(nz) * nz < m
+
+# minimal number of bits required to represent integer; ilog2(n) >= log2(n)
+ilog2(n::Integer) = sizeof(n)<<3 - leading_zeros(n)
+
 # Frobenius dot/inner product: trace(A'B)
 function dot(A::SparseMatrixCSC{T1,S1},B::SparseMatrixCSC{T2,S2}) where {T1,T2,S1,S2}
     m, n = size(A)