Parallelized testing using XUnit.jl. #71

Merged · 4 commits · Feb 2, 2021
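The conversion follows one pattern across the whole test suite: parameterised `@testset ... for` blocks, previously gated by a custom `@test_if` macro that checked `ARGS`, become plain `for` loops whose bodies are XUnit `@testcase` blocks that the parallel runner can schedule independently. A minimal before/after sketch of that shape (illustrative; not the literal diff):

```julia
# Before: one @testset per parameter value, gated on command-line ARGS.
@test_if "wmma" @testset "WMMA GEMM" for transpose_a in [false, true]
    # ... test body ...
end

# After: a plain loop; each @testcase is an independent unit of work
# that XUnit's ParallelTestRunner can execute concurrently.
@testset "WMMA GEMM" begin
    for transpose_a in [false, true]
        @testcase "$(transpose_a ? 'T' : 'N')" begin
            # ... test body ...
        end
    end
end
```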
17 changes: 2 additions & 15 deletions .buildkite/pipeline.yml
@@ -1,30 +1,17 @@
 steps:
-  - label: "Julia 1.5"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: '1.5'
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-      cap: "recent"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
   - label: "Julia 1.6-nightly"
     plugins:
       - JuliaCI/julia#v1:
           version: "1.6-nightly"
       - JuliaCI/julia-test#v1: ~
       - JuliaCI/julia-coverage#v1:
     agents:
       queue: "juliagpu"
       cuda: "*"
       cap: "recent"
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 60
 
   - label: "Julia nightly"
     plugins:
       - JuliaCI/julia#v1:
2 changes: 1 addition & 1 deletion Project.toml
@@ -17,4 +17,4 @@ ForwardDiff = "0.10"
 GPUifyLoops = "0.2"
 LLVM = "3"
 StaticArrays = "0.12, 1.0"
-julia = "1.5"
+julia = "1.6"
4 changes: 0 additions & 4 deletions README.md
@@ -6,7 +6,6 @@ _Flexible and performant GEMM kernels in Julia_
 
 | Julia       | CI |
 | ----------- | -- |
-| 1.5         | [![Continuous Integration][buildkite-julia1-image]][buildkite-julia1-url] |
 | 1.6-nightly | [![Continuous Integration][buildkite-julia16nightly-image]][buildkite-julia16nightly-url] |
 | nightly     | [![Continuous Integration][buildkite-julianightly-image]][buildkite-julianightly-url] |
 
@@ -56,9 +55,6 @@ We compare our kernels with the state-of-the-art libraries cuBLAS and CUTLASS on
 For more details on the implementation and performance results, please see our accompanying paper (pre-print available on [arXiv][arxiv-paper]).
 The [`CITATION.bib`](CITATION.bib) file in the root of this repository contains a citation in BibTeX format.
 
-[buildkite-julia1-image]: https://badge.buildkite.com/92f2ead968bafc516afa354576cccb7ab2f5b42a272d9cb0f0.svg?branch=master&step=Julia%201.5
-[buildkite-julia1-url]: https://buildkite.com/julialang/gemmkernels-dot-jl
-
 [buildkite-julia16nightly-image]: https://badge.buildkite.com/92f2ead968bafc516afa354576cccb7ab2f5b42a272d9cb0f0.svg?branch=master&step=Julia%201.6-nightly
 [buildkite-julia16nightly-url]: https://buildkite.com/julialang/gemmkernels-dot-jl
 [buildkite-julianightly-image]: https://badge.buildkite.com/92f2ead968bafc516afa354576cccb7ab2f5b42a272d9cb0f0.svg?branch=master&step=Julia%20nightly
5 changes: 3 additions & 2 deletions test/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+XUnit = "3e3c03f2-1a94-11e9-2981-050a4ca824ab"
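A note on the dependency changes above (my reading of the diff, not stated in the PR): `Test` disappears because XUnit.jl is intended as a drop-in replacement that provides the familiar `@test`/`@testset` API while adding `@testcase` and pluggable runners; the test files below keep using plain `@test` unchanged. `Pkg` is added because `runtests.jl` now installs a pinned XUnit revision at runtime (see the `test/runtests.jl` diff further down).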
21 changes: 11 additions & 10 deletions test/blas.jl
@@ -4,11 +4,11 @@ using LinearAlgebra
 
 CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
 
-@test_if "blas" @testset "BLAS API" begin
-@testset "WMMA GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
-    transpose_b = [false, true]
-
-    @testset "(M = $M, N = $N, K = $K)" for M in [128, 256],
-                                            N in [128, 256],
-                                            K in [128, 256]
+@testset "BLAS API" begin
+@testset "WMMA GEMM" begin
+    for transpose_a = [false, true], transpose_b = [false, true],
+        M in [128, 256], N in [128, 256], K in [128, 256]
+        @testcase "$( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
@@ -35,12 +35,12 @@ CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
         @test all(isapprox.(Array(c_gemmkernels), Array(c_cublas); rtol=sqrt(eps(Float16))));
+        end
     end
 end
 
-@testset "WMMA GEMM (A = diagonal, B = $( !transpose_b ? 'N' : 'T' ))" for transpose_b = [false, true]
-    @testset "(M = $M, N = $N, K = $K)" for M in [128, 256],
-                                            N in [128, 256],
-                                            K in [M]
-
+@testset "WMMA GEMM diagonal" begin
+    for transpose_b = [false, true],
+        M in [128, 256], N in [128, 256], K in [M]
+        @testcase "A = diagonal, B = $( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
         transpose_a = false
 
         alpha = rand(Float32)
@@ -66,4 +66,5 @@ CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
         @test all(isapprox.(Array(c_gemmkernels), Array(c_cublas); rtol=sqrt(eps(Float16))));
+        end
     end
 end
 end
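All of these comparisons use the same tolerance idiom: elementwise `isapprox` against a cuBLAS (or CPU) reference with `rtol = sqrt(eps(Float16))`. Since `eps(Float16) ≈ 9.8e-4`, the relative tolerance is roughly `0.031`, loose enough to absorb the rounding error accumulated over a K-long chain of Float16 multiply-adds. A minimal CPU-only illustration of the idiom (not part of the PR; `K = 128` chosen to match the smallest test size):

```julia
K = 128
a = rand(Float16, K, K)
b = rand(Float16, K, K)

ref    = Float32.(a) * Float32.(b)  # higher-precision reference result
approx = Float32.(a * b)            # result computed in Float16

# The same check used throughout blas.jl and matmul.jl:
@assert all(isapprox.(approx, ref; rtol = sqrt(eps(Float16))))
```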
72 changes: 40 additions & 32 deletions test/matmul.jl
@@ -6,10 +6,10 @@ using LinearAlgebra
 ################################################################################
 
 @testset "Matmul API" begin
-@test_if "wmma" @testset "WMMA GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
-    transpose_b = [false, true]
-
-    @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 128), (128, 128, 256), (256, 256, 256), (2048, 2048, 2048)]
+@testset "WMMA GEMM" begin
+    for transpose_a = [false, true], transpose_b = [false, true],
+        (M, N, K) in [(128, 128, 128), (256, 256, 128), (128, 128, 256), (256, 256, 256), (2048, 2048, 2048)]
+        @testcase "$( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
         alpha = 2
         beta = 3
 
@@ -51,12 +51,13 @@
 
         @test all(isapprox.(alpha * Float32.(new_a_h) * Float32.(new_b_h) + beta * c_h, Array(d); rtol = sqrt(eps(Float16))))
+        end
     end
 end
 
-@test_if "bias" @testset "WMMA GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' )) + bias" for transpose_a = [false, true],
-    transpose_b = [false, true]
-
-    @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+@testset "WMMA GEMM + bias" begin
+    for transpose_a = [false, true], transpose_b = [false, true],
+        (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+        @testcase "$( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
         a_h = rand(Float16, (M, K)) / sqrt(Float16(K))
         b_h = rand(Float16, (K, N)) / sqrt(Float16(K))
         c_h = rand(Float32, (M, N))
@@ -102,10 +103,13 @@
 
         @test all(isapprox.(Float32.(new_a_h) * Float32.(new_b_h) + c_h .+ Array(bias), Array(d); rtol = sqrt(eps(Float16))))
+        end
     end
 end
 
-@test_if "diagonal" @testset "WMMA GEMM (A = diagonal, B = $( !transpose_b ? 'N' : 'T' ))" for transpose_b = [false, true]
-    @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+@testset "WMMA GEMM diagonal" begin
+    for transpose_b = [false, true],
+        (M, N, K) in [(128, 128, 128), (256, 256, 256), (4096, 4096, 4096)]
+        @testcase "A = diagonal, B = $( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
         @assert M == K "Diagonal only supports square A matrix (M == K)"
 
         transpose_a = false
@@ -146,24 +150,25 @@
 
         @test all(isapprox.(Float32.(Diagonal(new_a_h)) * Float32.(new_b_h) + c_h, Array(d); rtol = sqrt(eps(Float16))))
+        end
     end
 end
 
-@test_if "complex" @testset "WMMA Complex GEMM ($( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ))" for transpose_a = [false, true],
-    transpose_b = [false, true]
-
-    @testset "(M = $M, N = $N, K = $K)" for (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
-        a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K));
-        b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K));
-        c_h = rand(Complex{Float32}, (M, N));
+@testset "WMMA Complex GEMM" begin
+    for transpose_a = [false, true], transpose_b = [false, true],
+        (M, N, K) = [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
+        @testcase "$( !transpose_a ? 'N' : 'T' )$( !transpose_b ? 'N' : 'T' ); M = $M, N = $N, K = $K" begin
+        a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K))
+        b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K))
+        c_h = rand(Complex{Float32}, (M, N))
 
         # Transpose input if necessary
         a_h = transpose_a ? transpose(a_h) : a_h
         b_h = transpose_b ? transpose(b_h) : b_h
 
-        a = CuArray(a_h);
-        b = CuArray(b_h);
-        c = CuArray(c_h);
-        d = similar(c);
+        a = CuArray(a_h)
+        b = CuArray(b_h)
+        c = CuArray(c_h)
+        d = similar(c)
 
         conf = GemmKernels.get_config(
             gemm_shape = (M = M, N = N, K = K),
@@ -209,20 +214,22 @@
 
         # TODO: Figure out why changing this to a * b + c = d instead of a * b = d - c
        # makes tests fail for CC (see #19).
-        @test all(isapprox.(Complex{Float32}.(new_a_h) * Complex{Float32}.(new_b_h), Array(d) - c_h; rtol=sqrt(eps(Float16))));
+        @test all(isapprox.(Complex{Float32}.(new_a_h) * Complex{Float32}.(new_b_h), Array(d) - c_h; rtol=sqrt(eps(Float16))))
+        end
     end
 end
 
-@test_if "dual" @testset "WMMA Dual GEMM" begin
-    @testset "(M = $M, N = $N, K = $K)" for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
-        a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K));
-        b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K));
-        c_h = rand(Complex{Float32}, (M, N));
+@testset "WMMA Dual GEMM" begin
+    for (M, N, K) in [(128, 128, 128), (256, 256, 256), (2048, 2048, 2048)]
+        @testcase "M = $M, N = $N, K = $K" begin
+        a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K))
+        b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K))
+        c_h = rand(Complex{Float32}, (M, N))
 
-        a = CuArray(a_h);
-        b = CuArray(b_h);
-        c = CuArray(c_h);
-        d = similar(c);
+        a = CuArray(a_h)
+        b = CuArray(b_h)
+        c = CuArray(c_h)
+        d = similar(c)
 
         conf = GemmKernels.get_config(
             gemm_shape = (M = M, N = N, K = K),
@@ -261,7 +268,8 @@
         c_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, c_h)
         d_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, Array(d))
 
-        @test all(isapprox.(a_dual * b_dual + c_dual, d_dual; rtol=sqrt(eps(Float16))));
+        @test all(isapprox.(a_dual * b_dual + c_dual, d_dual; rtol=sqrt(eps(Float16))))
+        end
     end
 end
 end
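The Dual GEMM test above deliberately reuses complex-valued inputs: `Complex{Float32}` and `ForwardDiff.Dual{Float32,Float32,1}` are both laid out as two consecutive `Float32`s, so the same host arrays can be `reinterpret`ed into dual numbers for the reference computation. The arithmetic differs, though: dual multiplication is `(a + b*eps)(c + d*eps) = ac + (ad + bc)*eps`, with no `-bd` term as in complex multiplication, which is why dual GEMM needs its own multiply-accumulate operator even though the storage matches. A small self-contained sketch of the layout correspondence (illustrative only, not part of the PR):

```julia
using ForwardDiff

# Both element types occupy 8 bytes: (re, im) vs (value, one partial).
c = Complex{Float32}[3.0f0 + 4.0f0im]
d = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, c)

@assert ForwardDiff.value(d[1])       == 3.0f0  # real part -> value
@assert ForwardDiff.partials(d[1])[1] == 4.0f0  # imaginary part -> partial
```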
18 changes: 6 additions & 12 deletions test/runtests.jl
@@ -1,25 +1,19 @@
 using GemmKernels
-using Test
 
+using Pkg
+Pkg.add(PackageSpec(name="XUnit", rev="9b756fcda72d813dbf017f8400d7c55251ef7d1b"))
+
+using XUnit
+
 import CUDA
 import InteractiveUtils
 
 @info "Julia details\n\n" * sprint(io->InteractiveUtils.versioninfo(io))
 @info "CUDA details\n\n" * sprint(io->CUDA.versioninfo(io))
 
-macro test_if(label, expr)
-    return quote
-        if isempty(ARGS) || $(label) in ARGS
-            $(esc(expr))
-        else
-            nothing
-        end
-    end
-end
-
 CUDA.allowscalar(false)
 
-@testset "GemmKernels.jl" begin
+@testset runner=ParallelTestRunner() "GemmKernels.jl" begin
     include("tiling.jl")
     include("matmul.jl")
     include("blas.jl")
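With the ARGS-based `@test_if` filtering removed, the whole suite now runs under XUnit's `ParallelTestRunner`, which (as I understand XUnit.jl's runners) distributes `@testcase` bodies across the available Julia threads, so CI should launch Julia with multiple threads (e.g. `julia -t 8`) to benefit. The new top-level structure, reduced to a runnable sketch (hypothetical `demo` suite, not the repository's tests):

```julia
using XUnit

# Each @testcase is a schedulable unit; @testset only provides grouping.
@testset runner=ParallelTestRunner() "demo" begin
    for n in (128, 256, 512)
        @testcase "n = $n" begin
            @test n % 64 == 0
        end
    end
end
```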
18 changes: 9 additions & 9 deletions test/tiling.jl
@@ -2,39 +2,39 @@ using GemmKernels.Tiling
 
 ################################################################################
 
-@test_if "tiling" @testset "Tiling API" begin
+@testset "Tiling API" begin
     @testset "Tiles" begin
-        @testset "Index" begin
+        @testcase "Index" begin
             @test Tile(M = 4, N = 4, K = 4).index == (M = 0, N = 0, K = 0)
         end
 
-        @testset "Projection" begin
+        @testcase "Projection" begin
             @test Tile(M = 1, N = 2, K = 3).MN == Tile(M = 1, N = 2)
             @test Tile(M = 1, N = 2, K = 3).NM == Tile(N = 2, M = 1)
             @test Tile(M = 1, N = 2, K = 3).M == Tile(M = 1)
             @test Tile(M = 1, N = 2, K = 3).KMN == Tile(K = 3, M = 1, N = 2)
         end
 
-        @testset "Transposition" begin
+        @testcase "Transposition" begin
             @test transpose(Tile(M = 1, N = 2)) == Tile(N = 2, M = 1)
             @test transpose(Tile(M = 1, N = 2, K = 3)) == Tile(K = 3, N = 2, M = 1)
         end
 
-        @testset "Translate base" begin
+        @testcase "Translate base" begin
             tile = translate_base(Tile(M = 10, N = 20), (M = 1, N = 2))
             @test tile.size == (M = 10, N = 20)
             @test tile.base == (M = 1, N = 2)
             @test tile.offset == (M = 0, N = 0)
         end
 
-        @testset "Translate offset" begin
+        @testcase "Translate offset" begin
             tile = translate_offset(Tile(M = 10, N = 20), (M = 1, N = 2))
             @test tile.size == (M = 10, N = 20)
             @test tile.base == (M = 0, N = 0)
             @test tile.offset == (M = 1, N = 2)
         end
 
-        @testset "Linearise" begin
+        @testcase "Linearise" begin
             tile = Tile(M = 3, N = 5)
             for i = 0 : 2, j = 0 : 4
                 tile_t = translate_offset(tile, (M = i, N = j))
@@ -45,7 +45,7 @@ using GemmKernels.Tiling
         end
     end
 
     @testset "Tile iteration" begin
-        @testset "Subdivide" begin
+        @testcase "Subdivide" begin
             tile_size = (M = 8, N = 4)
             num_tiles = (M = 2, N = 4)
             tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N)
@@ -59,7 +59,7 @@
             end
         end
 
-        @testset "Parallellise" begin
+        @testcase "Parallellise" begin
             tile_size = (M = 8, N = 4)
             num_tiles = (M = 2, N = 8)
             tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N)
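For readers skimming the diff, the tiling assertions above double as API documentation. Collected into one sketch (every call and result taken from the tests themselves):

```julia
using GemmKernels.Tiling

# A fresh tile starts at index zero:
@assert Tile(M = 4, N = 4, K = 4).index == (M = 0, N = 0, K = 0)

# Projection keeps (and reorders) the named dimensions:
@assert Tile(M = 1, N = 2, K = 3).MN == Tile(M = 1, N = 2)

# Transposition reverses the dimension order:
@assert transpose(Tile(M = 1, N = 2)) == Tile(N = 2, M = 1)

# translate_base moves the base and leaves size/offset alone;
# translate_offset does the same for the offset:
t = translate_base(Tile(M = 10, N = 20), (M = 1, N = 2))
@assert (t.size, t.base, t.offset) ==
        ((M = 10, N = 20), (M = 1, N = 2), (M = 0, N = 0))
```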