format
ArnoStrouwen committed Sep 23, 2023
1 parent 974a36f commit daf7212
Showing 15 changed files with 180 additions and 87 deletions.
28 changes: 22 additions & 6 deletions benchmarks/applelu.jl
@@ -18,7 +18,13 @@ function luflop(m, n = m; innerflop = 2)
end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
algs = [
LUFactorization(),
GenericLUFactorization(),
RFLUFactorization(),
AppleAccelerateLUFactorization(),
MetalLUFactorization(),
]
res = [Float32[] for i in 1:length(algs)]

ns = 4:8:500
@@ -28,10 +34,14 @@ for i in 1:length(ns)
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)
global u0 = rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
copy(b);
u0 = copy(u0),
alias_A = true,
alias_b = true))
push!(res[j], luflop(n) / bt / 1e9)
end
end
@@ -41,11 +51,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
p = plot(ns,
res[1];
ylabel = "GFLOPs",
xlabel = "N",
title = "GFLOPs for NxN LU Factorization",
label = string(Symbol(parameterless_type(algs[1]))),
legend = :outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
20 changes: 15 additions & 5 deletions benchmarks/cudalu.jl
@@ -28,10 +28,14 @@ for i in 1:length(ns)
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)
global u0 = rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
copy(b);
u0 = copy(u0),
alias_A = true,
alias_b = true))
push!(res[j], luflop(n) / bt / 1e9)
end
end
@@ -41,11 +45,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
p = plot(ns,
res[1];
ylabel = "GFLOPs",
xlabel = "N",
title = "GFLOPs for NxN LU Factorization",
label = string(Symbol(parameterless_type(algs[1]))),
legend = :outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
29 changes: 23 additions & 6 deletions benchmarks/lu.jl
@@ -18,7 +18,14 @@ function luflop(m, n = m; innerflop = 2)
end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), MKLLUFactorization(), FastLUFactorization(), SimpleLUFactorization()]
algs = [
LUFactorization(),
GenericLUFactorization(),
RFLUFactorization(),
MKLLUFactorization(),
FastLUFactorization(),
SimpleLUFactorization(),
]
res = [Float64[] for i in 1:length(algs)]

ns = 4:8:500
@@ -28,10 +35,14 @@ for i in 1:length(ns)
rng = MersenneTwister(123)
global A = rand(rng, n, n)
global b = rand(rng, n)
global u0= rand(rng, n)
global u0 = rand(rng, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
copy(b);
u0 = copy(u0),
alias_A = true,
alias_b = true))
push!(res[j], luflop(n) / bt / 1e9)
end
end
@@ -41,11 +52,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
p = plot(ns,
res[1];
ylabel = "GFLOPs",
xlabel = "N",
title = "GFLOPs for NxN LU Factorization",
label = string(Symbol(parameterless_type(algs[1]))),
legend = :outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("lubench.png")
savefig("lubench.pdf")
20 changes: 15 additions & 5 deletions benchmarks/metallu.jl
@@ -28,10 +28,14 @@ for i in 1:length(ns)
rng = MersenneTwister(123)
global A = rand(rng, Float32, n, n)
global b = rand(rng, Float32, n)
global u0= rand(rng, Float32, n)
global u0 = rand(rng, Float32, n)

for j in 1:length(algs)
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
copy(b);
u0 = copy(u0),
alias_A = true,
alias_b = true))
GC.gc()
push!(res[j], luflop(n) / bt / 1e9)
end
@@ -42,11 +46,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
p = plot(ns,
res[1];
ylabel = "GFLOPs",
xlabel = "N",
title = "GFLOPs for NxN LU Factorization",
label = string(Symbol(parameterless_type(algs[1]))),
legend = :outertopright)
for i in 2:length(res)
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
18 changes: 9 additions & 9 deletions docs/src/solvers/solvers.md
@@ -14,15 +14,15 @@ but one may need to change this to receive more performance or precision. If
more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
the best choices, with SVD being the slowest but most precise.

For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
150x150 matrices, though this can be dependent on the exact details of the hardware. After this
point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers,
`AppleAccelerateLUFactorization` is generally the fastest. `LUFactorization` will
use your base system BLAS, which can be fast or slow depending on the hardware configuration.
`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
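
A minimal selection sketch following these guidelines (the matrix sizes are illustrative
assumptions; the algorithm types are the ones named above):

```julia
using LinearSolve

A = rand(100, 100)
b = rand(100)
prob = LinearProblem(A, b)

# Below roughly 150x150, RecursiveFactorization tends to be fastest:
sol_small = solve(prob, RFLUFactorization())

# For larger dense matrices, MKL is usually faster on most (non-Mac) hardware:
sol_large = solve(prob, MKLLUFactorization())
```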

For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
Expand All @@ -31,9 +31,9 @@ CUDA offload supports Float64 but most consumer GPU hardware will be much faster
this is only recommended for Float32 matrices.
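
A hedged sketch of the offload workflow (assumes a CUDA-capable GPU and that the
extension providing `CudaOffloadFactorization` is loaded):

```julia
using LinearSolve

# Float32, per the recommendation above; the size is an illustrative assumption.
A = rand(Float32, 2000, 2000)
b = rand(Float32, 2000)

# The problem setup is unchanged; only the algorithm choice moves work to the GPU.
sol = solve(LinearProblem(A, b), CudaOffloadFactorization())
```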

!!! note

Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
If one is looking to best optimize their system, we suggest running the performance
tuning benchmark.

@@ -65,19 +65,19 @@ The interface is detailed [here](@ref custom).
### Lazy SciMLOperators

If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
then using a Krylov method is preferred in order to not concretize the matrix.
Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, is compatible
with both CPUs and GPUs, and is thus generally the preferred choice for Krylov methods. The
choice of Krylov method should be the one best matched to the properties of the operator:
for example, `Krylov_CG()` if it is positive definite, but `Krylov_GMRES()` if it has no
special properties to exploit.
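
A short sketch of matching the method to the operator (the positive-definite
construction is an illustrative assumption, and a plain matrix stands in for a lazy
operator):

```julia
using LinearSolve, LinearAlgebra

n = 50
A = rand(n, n)
A = A' * A + n * I   # symmetric positive definite by construction
b = rand(n)

sol_cg = solve(LinearProblem(A, b), Krylov_CG())        # exploits positive definiteness
sol_gmres = solve(LinearProblem(A, b), Krylov_GMRES())  # assumes nothing about A
```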

!!! tip

If your materialized operator is a uniform block diagonal matrix, then you can use
`SimpleGMRES(; blocksize = <known block size>)` to further improve performance.
This often shows up in neural networks, where the Jacobian with respect to the inputs
is (almost always) a uniform block diagonal matrix with block size equal to the size
of the input divided by the batch size; see the sketch below.
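
A usage sketch for the uniform-block-diagonal case (the dimensions and the block
construction are assumptions for illustration):

```julia
using LinearSolve, LinearAlgebra

bs, nblocks = 16, 8                  # block size and number of blocks (assumed)
A = zeros(bs * nblocks, bs * nblocks)
for k in 1:nblocks                   # fill a uniform block diagonal matrix
    r = (bs * (k - 1) + 1):(bs * k)
    A[r, r] = rand(bs, bs) + bs * I  # diagonally dominant blocks, for conditioning
end
b = rand(bs * nblocks)

sol = solve(LinearProblem(A, b), SimpleGMRES(; blocksize = bs))
```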

## Full List of Methods
2 changes: 1 addition & 1 deletion ext/LinearSolveBlockDiagonalsExt.jl
@@ -4,7 +4,7 @@ using LinearSolve, BlockDiagonals

function LinearSolve.init_cacheval(alg::SimpleGMRES{false}, A::BlockDiagonal, b, args...;
kwargs...)
@assert ndims(A) == 2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
@assert ndims(A)==2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
# We need to perform this check even when `zeroinit == true`, since the type of the
# cache is dependent on whether we are able to use the specialized dispatch.
bsizes = blocksizes(A)
2 changes: 1 addition & 1 deletion ext/LinearSolveKernelAbstractionsExt.jl
@@ -9,7 +9,7 @@ using GPUArraysCore
function LinearSolve._fast_sym_givens!(c, s, R, nr::Int, inner_iter::Int, bsize::Int, Hbis)
backend = get_backend(Hbis)
kernel! = __fast_sym_givens_kernel!(backend)
kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange=bsize)
kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange = bsize)
return c, s, R
end

59 changes: 38 additions & 21 deletions ext/LinearSolveMKLExt.jl
@@ -2,49 +2,60 @@ module LinearSolveMKLExt

using MKL_jll
using LinearAlgebra: BlasInt, LU
using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
@blasfunc, chkargsok
using LinearAlgebra.LAPACK: require_one_based_indexing,
chkfinite, chkstride1,
@blasfunc, chkargsok
using LinearAlgebra
const usemkl = MKL_jll.is_available()

using LinearSolve
using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache, SciMLBase

function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
function getrf!(A::AbstractMatrix{<:Float64};
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
info = Ref{BlasInt}(),
check = false)
require_one_based_indexing(A)
check && chkfinite(A)
chkstride1(A)
m, n = size(A)
lda = max(1,stride(A, 2))
lda = max(1, stride(A, 2))
if isempty(ipiv)
ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
end
ccall((@blasfunc(dgetrf_), MKL_jll.libmkl_rt), Cvoid,
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
m, n, A, lda, ipiv, info)
chkargsok(info[])
A, ipiv, info[], info #Error code is stored in LU factorization type
end

function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
function getrf!(A::AbstractMatrix{<:Float32};
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
info = Ref{BlasInt}(),
check = false)
require_one_based_indexing(A)
check && chkfinite(A)
chkstride1(A)
m, n = size(A)
lda = max(1,stride(A, 2))
lda = max(1, stride(A, 2))
if isempty(ipiv)
ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
end
ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
m, n, A, lda, ipiv, info)
chkargsok(info[])
A, ipiv, info[], info #Error code is stored in LU factorization type
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}())
function getrs!(trans::AbstractChar,
A::AbstractMatrix{<:Float64},
ipiv::AbstractVector{BlasInt},
B::AbstractVecOrMat{<:Float64};
info = Ref{BlasInt}())
require_one_based_indexing(A, ipiv, B)
LinearAlgebra.LAPACK.chktrans(trans)
chkstride1(A, B, ipiv)
@@ -57,14 +68,19 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::Abstrac
end
nrhs = size(B, 2)
ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
1)
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
B
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}())
function getrs!(trans::AbstractChar,
A::AbstractMatrix{<:Float32},
ipiv::AbstractVector{BlasInt},
B::AbstractVecOrMat{<:Float32};
info = Ref{BlasInt}())
require_one_based_indexing(A, ipiv, B)
LinearAlgebra.LAPACK.chktrans(trans)
chkstride1(A, B, ipiv)
@@ -77,9 +93,10 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::Abstrac
end
nrhs = size(B, 2)
ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
1)
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
B
end
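
# A usage sketch of the two wrappers above (illustrative only; these helpers are
# internal to this extension, and the random system is an assumption):
#
#     A = rand(8, 8); b = rand(8)
#     F, ipiv, code, _ = getrf!(copy(A))  # in-place LU; `code` is the LAPACK info flag
#     x = getrs!('N', F, ipiv, copy(b))   # solve A * x = b from the computed factors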
@@ -125,4 +142,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization;
=#
end

end
2 changes: 1 addition & 1 deletion ext/LinearSolveMetalExt.jl
@@ -28,4 +28,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization;
SciMLBase.build_linear_solution(alg, y, nothing, cache)
end

end
5 changes: 3 additions & 2 deletions src/LinearSolve.jl
@@ -28,8 +28,9 @@ PrecompileTools.@recompile_invalidations begin
import InteractiveUtils

using LinearAlgebra: BlasInt, LU
using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
@blasfunc, chkargsok
using LinearAlgebra.LAPACK: require_one_based_indexing,
chkfinite, chkstride1,
@blasfunc, chkargsok

import GPUArraysCore
import Preferences