diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 24ef0f84..b6fc8766 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -15,7 +15,14 @@ steps:
         julia --project -e '
           # make sure the 1.6-era Manifest works on this Julia version
          using Pkg
-          Pkg.resolve()'
+          Pkg.resolve()
+
+          # work around XUnit.jl bug
+          try
+            Pkg.add(url="https://github.com/maleadt/XUnit.jl", rev="pass_compat")
+          catch err
+            @warn "Could not install patched version of XUnit.jl" exception=err
+          end'
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120
     matrix:
diff --git a/.gitignore b/.gitignore
index 8b137891..c181d1f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-
+test/Manifest.toml
diff --git a/test/Project.toml b/test/Project.toml
index c11f3159..8828b9af 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,6 +1,9 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+XUnit = "3e3c03f2-1a94-11e9-2981-050a4ca824ab"
diff --git a/test/blas.jl b/test/blas.jl
index 68b87a40..5f296941 100644
--- a/test/blas.jl
+++ b/test/blas.jl
@@ -4,12 +4,12 @@ using LinearAlgebra
 
 CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
 
-@test_if "blas" @testset "BLAS API" begin
+@testset "BLAS API" begin
     @testset "WMMA GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
         transpose_b = [false, true],
         (A_type, B_type, CD_type, min_dimension) in [(Float16, Float16, Float16, 256), (Float16, Float16, Float32, 128)]
 
-        @testset "(M = $M, N = $N, K = $K)" for M in min_dimension .* [1, 2],
+        @testcase "(M = $M, N = $N, K = $K)" for M in min_dimension .* [1, 2],
             N in min_dimension .* [1, 2],
             K in min_dimension .* [1, 2]
 
diff --git a/test/matmul.jl b/test/matmul.jl
index e4a1deb8..907db44b 100644
--- a/test/matmul.jl
+++ b/test/matmul.jl
@@ -6,15 +6,15 @@ using LinearAlgebra
 ################################################################################
 
 @testset "Matmul API" begin
-    @test_if "fpu compute and data types" @testset "FPU GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) OP ($(OP_M), $(OP_N), $(OP_K))" for
+    @testset "FPU GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) OP ($(OP_M), $(OP_N), $(OP_K))" for
         (A_type, B_type, CD_type, min_dimension) in [
             (Float16, Float16, Float32, 128), (Float32, Float32, Float32, 128), (Float32, Float32, Float64, 128), (Float64, Float64, Float64, 128),
-            (Int16, Int16, Int16, 128), (Int32, Int32, Int32, 128), (Int64, Int64, Int64, 128),
-        ],
-        transpose_a = [false, true],
-        transpose_b = [false, true],
+            (Int16, Int16, Int16, 128), (Int32, Int32, Int32, 128), (Int64, Int64, Int64, 128),
+        ],
+        transpose_a = [false, true],
+        transpose_b = [false, true],
         (OP_M, OP_N, OP_K) in [(8, 16, 2)]
 
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2, 2, 1], [1, 1, 2], [2, 2, 2]], [[2048, 2048, 2048]])
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2, 2, 1], [1, 1, 2], [2, 2, 2]], [[2048, 2048, 2048]])
             alpha = convert(A_type, 2)
             beta = convert(CD_type, 3)
@@ -59,7 +59,7 @@ using LinearAlgebra
             # Transpose outputs, if necessary
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
-            
+
             if A_type <: Integer
                 @test all(isapprox.(alpha * CD_type.(new_a_h) * CD_type.(new_b_h) + beta * c_h, Array(d)))
             else
@@ -68,13 +68,13 @@ using LinearAlgebra
         end
     end
 
-    @test_if "fpu operator shape" @testset "FPU GEMM OPERATOR SHAPE ($(OP_M), $(OP_N), $(OP_K)) (NN, NT, TN, TT)" for (OP_M, OP_N, OP_K) in [
-        (4, 8, 1), (8, 8, 1), (4, 16, 1), (4, 8, 2), (8, 16, 2)
+    @testset "FPU GEMM OPERATOR SHAPE ($(OP_M), $(OP_N), $(OP_K)) (NN, NT, TN, TT)" for (OP_M, OP_N, OP_K) in [
+        (4, 8, 1), (8, 8, 1), (4, 16, 1), (4, 8, 2), (8, 16, 2)
     ]
-        @testset "NN, NT, TN, TT" for (transpose_a, transpose_b) in [(false, false), (false, true), (true, false), (true, true)]
+        @testcase "NN, NT, TN, TT" for (transpose_a, transpose_b) in [(false, false), (false, true), (true, false), (true, true)]
             (M, N, K) = (128, 128, 128)
             (A_type, B_type, CD_type) = (Float32, Float32, Float32)
-            
+
             alpha = convert(A_type, 2)
             beta = convert(CD_type, 3)
 
@@ -114,18 +114,18 @@ using LinearAlgebra
             # Transpose outputs, if necessary
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
-            
+
             @test all(isapprox.(alpha * CD_type.(new_a_h) * CD_type.(new_b_h) + beta * c_h, Array(d); rtol = sqrt(eps(A_type))))
         end
     end
 
-    @test_if "tropical fpu" @testset "TROPICAL GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) OP ($(OP_M), $(OP_N), $(OP_K))" for
-        (A_type, B_type, CD_type, min_dimension) in [(Float32, Float32, Float32, 128)],
-        transpose_a = [false, true],
-        transpose_b = [false, true],
+    @testset "TROPICAL GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) OP ($(OP_M), $(OP_N), $(OP_K))" for
+        (A_type, B_type, CD_type, min_dimension) in [(Float32, Float32, Float32, 128)],
+        transpose_a = [false, true],
+        transpose_b = [false, true],
         (OP_M, OP_N, OP_K) in [(8, 16, 2)]
 
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2, 2, 1], [1, 1, 2], [2, 2, 2]])
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2, 2, 1], [1, 1, 2], [2, 2, 2]])
             a_h = rand(A_type, (M, K)) / sqrt(A_type(K))
             b_h = rand(B_type, (K, N)) / sqrt(B_type(K))
             c_h = rand(CD_type, (M, N))
@@ -135,7 +135,7 @@ using LinearAlgebra
             for j in 1 : N
                 d_h[i, j] = c_h[i, j]
                 for k in 1 : K
-                    d_h[i, j] = max(a_h[i, k] + b_h[k, j], d_h[i, j]) 
+                    d_h[i, j] = max(a_h[i, k] + b_h[k, j], d_h[i, j])
                 end
             end
         end
@@ -164,16 +164,16 @@ using LinearAlgebra
             )
 
             GemmKernels.matmul(a, b, c, d, conf; kernel = Kernel.matmul_pipelined)
-            
+
             @test all(isapprox.(d_h, Array(d); rtol = sqrt(eps(A_type))))
         end
     end
 
-    @test_if "wmma" @testset "WMMA GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
+    @testset "WMMA GEMM $(A_type)*$(B_type)+$(CD_type)=$(CD_type) ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
         transpose_b = [false, true],
         (A_type, B_type, CD_type, min_dimension) in [(Float16, Float16, Float16, 256), (Float16, Float16, Float32, 128)]
 
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2,2,1], [1,1,2], [2,2,2]], [[2048, 2048, 2048]])
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in vcat(min_dimension.*[[1,1,1], [2,2,1], [1,1,2], [2,2,2]], [[2048, 2048, 2048]])
             alpha = convert(A_type, 2)
             beta = convert(CD_type, 3)
 
@@ -217,10 +217,10 @@ using LinearAlgebra
         end
     end
 
-    @test_if "bias" @testset "WMMA GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) + bias" for transpose_a = [false, true],
+    @testset "WMMA GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) + bias" for transpose_a = [false, true],
         transpose_b = [false, true]
 
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
             a_h = rand(Float16, (M, K)) / sqrt(Float16(K))
             b_h = rand(Float16, (K, N)) / sqrt(Float16(K))
             c_h = rand(Float32, (M, N))
@@ -268,8 +268,8 @@ using LinearAlgebra
         end
     end
 
-    @test_if "diagonal" @testset "WMMA GEMM (A = diagonal, B = $( !transpose_b ? 'N' : 'T' ))" for transpose_b = [false, true]
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+    @testset "WMMA GEMM (A = diagonal, B = $( !transpose_b ? 'N' : 'T' ))" for transpose_b = [false, true]
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
             @assert M == K "Diagonal only supports square A matrix (M == K)"
 
             transpose_a = false
@@ -312,10 +312,10 @@ using LinearAlgebra
         end
     end
 
-    @test_if "complex" @testset "WMMA Complex GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
+    @testset "WMMA Complex GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
         transpose_b = [false, true]
 
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
             a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K));
             b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K));
             c_h = rand(Complex{Float32}, (M, N));
@@ -377,8 +377,8 @@ using LinearAlgebra
         end
     end
 
-    @test_if "dual" @testset "WMMA Dual GEMM" begin
-        @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
+    @testset "WMMA Dual GEMM" begin
+        @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
             a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K));
             b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K));
             c_h = rand(Complex{Float32}, (M, N));
'N' : 'T' ))" for transpose_a = [false, true], transpose_b = [false, true] - @testset "(M = $M, N = $N, K = $K)" for (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)] + @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)] a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); c_h = rand(Complex{Float32}, (M, N)); @@ -377,8 +377,8 @@ using LinearAlgebra end end - @test_if "dual" @testset "WMMA Dual GEMM" begin - @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)] + @testset "WMMA Dual GEMM" begin + @testcase "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)] a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); c_h = rand(Complex{Float32}, (M, N)); diff --git a/test/runtests.jl b/test/runtests.jl index d49b1f72..d2dfb96f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,26 +1,25 @@ -using GemmKernels -using Test -import CUDA -import InteractiveUtils +using Distributed -@info "Julia details\n\n" * sprint(io->InteractiveUtils.versioninfo(io)) -@info "CUDA details\n\n" * sprint(io->CUDA.versioninfo(io)) +# determine parallelism +cpu_jobs = Sys.CPU_THREADS +memory_jobs = Int(Sys.free_memory()) รท (2 * 2^30) +jobs = min(cpu_jobs, memory_jobs) +@info "Running $jobs tests in parallel. If this is too many, set the `JULIA_CPU_THREADS` environment variable." -macro test_if(label, expr) - return quote - if isempty(ARGS) || $(label) in ARGS - $(esc(expr)) - else - nothing - end - end +# add workers +exeflags = Base.julia_cmd() +filter!(exeflags.exec) do c + return !(startswith(c, "--depwarn") || startswith(c, "--check-bounds")) end - -CUDA.allowscalar(false) - -@testset "GemmKernels.jl" begin - include("tiling.jl") - include("matmul.jl") - include("blas.jl") +push!(exeflags.exec, "--check-bounds=yes") +push!(exeflags.exec, "--startup-file=no") +push!(exeflags.exec, "--depwarn=yes") +push!(exeflags.exec, "--project=$(Base.active_project())") +exename = popfirst!(exeflags.exec) +withenv("JULIA_NUM_THREADS" => 1, "OPENBLAS_NUM_THREADS" => 1) do + addprocs(jobs; exename, exeflags) end + +@everywhere using XUnit +runtests("tests.jl") diff --git a/test/tests.jl b/test/tests.jl new file mode 100644 index 00000000..b11e8ccd --- /dev/null +++ b/test/tests.jl @@ -0,0 +1,25 @@ +using Distributed, XUnit, Dates +using CUDA, GemmKernels +CUDA.allowscalar(false) + +if myid() == 1 + using InteractiveUtils + @info "Julia details:\n" * sprint(io->InteractiveUtils.versioninfo(io)) + @info "CUDA details:\n" * sprint(io->CUDA.versioninfo(io)) +end + +t0 = now() +try + @testset runner=DistributedTestRunner() "GemmKernels.jl" begin + include("tiling.jl") + include("matmul.jl") + include("blas.jl") + end +finally + if myid() == 1 + t1 = now() + elapsed = canonicalize(Dates.CompoundPeriod(t1-t0)) + println("Testing finished in $elapsed") + end +end + diff --git a/test/tiling.jl b/test/tiling.jl index ebd4b200..0b101f6c 100644 --- a/test/tiling.jl +++ b/test/tiling.jl @@ -2,39 +2,39 @@ using GemmKernels.Tiling ################################################################################ -@test_if "tiling" @testset "Tiling API" begin +@testset "Tiling API" begin @testset "Tiles" begin - @testset "Index" begin + @testcase "Index" begin @test Tile(M = 4, N = 4, K = 4).index == (M = 0, N = 0, K = 0) end - @testset "Projection" 
diff --git a/test/tiling.jl b/test/tiling.jl
index ebd4b200..0b101f6c 100644
--- a/test/tiling.jl
+++ b/test/tiling.jl
@@ -2,39 +2,39 @@ using GemmKernels.Tiling
 
 ################################################################################
 
-@test_if "tiling" @testset "Tiling API" begin
+@testset "Tiling API" begin
     @testset "Tiles" begin
-        @testset "Index" begin
+        @testcase "Index" begin
            @test Tile(M = 4, N = 4, K = 4).index == (M = 0, N = 0, K = 0)
         end
 
-        @testset "Projection" begin
+        @testcase "Projection" begin
            @test Tile(M = 1, N = 2, K = 3).MN == Tile(M = 1, N = 2)
            @test Tile(M = 1, N = 2, K = 3).NM == Tile(N = 2, M = 1)
            @test Tile(M = 1, N = 2, K = 3).M == Tile(M = 1)
            @test Tile(M = 1, N = 2, K = 3).KMN == Tile(K = 3, M = 1, N = 2)
         end
 
-        @testset "Transposition" begin
+        @testcase "Transposition" begin
            @test transpose(Tile(M = 1, N = 2)) == Tile(N = 2, M = 1)
            @test transpose(Tile(M = 1, N = 2, K = 3)) == Tile(K = 3, N = 2, M = 1)
         end
 
-        @testset "Translate base" begin
+        @testcase "Translate base" begin
             tile = translate_base(Tile(M = 10, N = 20), (M = 1, N = 2))
             @test tile.size == (M = 10, N = 20)
             @test tile.base == (M = 1, N = 2)
             @test tile.offset == (M = 0, N = 0)
         end
 
-        @testset "Translate offset" begin
+        @testcase "Translate offset" begin
             tile = translate_offset(Tile(M = 10, N = 20), (M = 1, N = 2))
             @test tile.size == (M = 10, N = 20)
             @test tile.base == (M = 0, N = 0)
             @test tile.offset == (M = 1, N = 2)
         end
 
-        @testset "Linearise" begin
+        @testcase "Linearise" begin
             tile = Tile(M = 3, N = 5)
             for i = 0 : 2, j = 0 : 4
                 tile_t = translate_offset(tile, (M = i, N = j))
@@ -45,7 +45,7 @@ using GemmKernels.Tiling
     end
 
     @testset "Tile iteration" begin
-        @testset "Subdivide" begin
+        @testcase "Subdivide" begin
             tile_size = (M = 8, N = 4)
             num_tiles = (M = 2, N = 4)
             tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N)
@@ -59,7 +59,7 @@ using GemmKernels.Tiling
             end
         end
 
-        @testset "Parallellise" begin
+        @testcase "Parallellise" begin
             tile_size = (M = 8, N = 4)
             num_tiles = (M = 2, N = 8)
             tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N)