diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
index f49a16d0..5f402eec 100644
--- a/benchmarks/runbenchmarks.jl
+++ b/benchmarks/runbenchmarks.jl
@@ -105,6 +105,7 @@ end
 include("../configs/configs.jl")

 results = Dict()
+baseline_results = Dict()
 details = Dict()

 for cf in get_configs()
@@ -115,7 +116,7 @@ for cf in get_configs()
     run_gemm(cf, a, b, c, d)

     # benchmark
-    profile_results = CUDA.@profiled begin
+    profile_results = CUDA.@profile begin
         for sample in 1:NUM_SAMPLES
             run_gemm(cf, a, b, c, d)
         end
@@ -123,22 +124,41 @@ for cf in get_configs()

     # XXX: This works for now, since every GEMM is one kernel, but later on we may want to benchmark
     # operations consisting of multiple kernel launches...
-    # XXX: Will this always work with mangling?
-    matmul_results = filter(row -> contains(row.name, String(Symbol(cf.kernel))), profile_results.device)
-
-    @assert size(matmul_results, 1) == NUM_SAMPLES
+    profile_results = profile_results.device

     # get info
     details[cf.name] = Dict(
-        "registers" => matmul_results[1, "registers"],
-        "dynamic_shared_mem" => matmul_results[1, "shared_mem"].dynamic,
-        "static_shared_mem" => matmul_results[1, "shared_mem"].static,
-        "local_mem" => matmul_results[1, "local_mem"].thread
+        "registers" => profile_results[1, "registers"],
+        "dynamic_shared_mem" => profile_results[1, "shared_mem"].dynamic,
+        "static_shared_mem" => profile_results[1, "shared_mem"].static,
+        "local_mem" => profile_results[1, "local_mem"].thread
     )

-    times = 1e9 .* (matmul_results[!, "stop"] - matmul_results[!, "start"])
+    times = 1e9 .* (profile_results[!, "stop"] - profile_results[!, "start"])
+    @assert length(times) == NUM_SAMPLES
+
+    @info "\tGemmKernels: $(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"
+
+    if !isnothing(cf.baseline)
+        # benchmark baseline
+        baseline_profile_results = CUDA.@profile begin
+            for sample in 1:NUM_SAMPLES
+                run_baseline(cf, a, b, c, d)
+            end
+        end
+
+        baseline_profile_results = baseline_profile_results.device
+        @assert size(baseline_profile_results, 1) % NUM_SAMPLES == 0
+
+        baseline_times = 1e9 .* sum.(Iterators.partition(baseline_profile_results[!, "stop"] - baseline_profile_results[!, "start"], size(baseline_profile_results, 1) ÷ NUM_SAMPLES))
+        @assert length(baseline_times) == NUM_SAMPLES
+
+        baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"
+        @info "\tBaseline: $(prettytime(baseline_times)) $(prettyflops(baseline_times, cf.config.matmul_shape)) (GemmKernels: $(baseline_ratio)%)"
+
+        baseline_results[cf.name] = Dict("times" => baseline_times)
+    end

-    @info "\t$(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"
     results[cf.name] = Dict("times" => times)
 end

@@ -303,6 +323,20 @@ if previous_results !== nothing
         end
     end

+    # Print results compared to baseline.
+    println(io, "# Comparison with baseline")
+
+    println(io, "| test | GemmKernels | Baseline | % |")
+    println(io, "|------|-------------|----------|---|")
+
+    for k in keys(baseline_results)
+        times = results[k]["times"]
+        baseline_times = baseline_results[k]["times"]
+        baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"
+
+        println(io, "| $(markdown_escaped_code(k)) | $(prettytime(times)) | $(prettytime(baseline_times)) | $(baseline_ratio) |")
+    end
+
     body = String(take!(io))

     println(body)
diff --git a/configs/configs.jl b/configs/configs.jl
index 56568605..cf83647b 100644
--- a/configs/configs.jl
+++ b/configs/configs.jl
@@ -21,6 +21,7 @@ struct Configuration
     epilogue        # The epilogue to use.
     verify          # Verify function to use.
     kernel          # The kernel function to use.
+    baseline        # Baseline implementation to compare performance against
 end

 function get_custom_mul!(element_update)
@@ -82,6 +83,12 @@ function run_gemm(cf::Configuration, a, b, c, d)
                        kernel = cf.kernel)
 end

+# Run the baseline.
+function run_baseline(cf::Configuration, a, b, c, d)
+    @assert !isnothing(cf.baseline)
+    cf.baseline(a, b, c, d, cf.alpha, cf.beta, cf.transpose_a, cf.transpose_b)
+end
+
 # Verify results.
 function verify(cf::Configuration, c_h, d)
     cf.verify(c_h, d)
@@ -101,18 +108,28 @@ function verify_dual(c_h, d)
     isapprox(c_dual, d_dual)
 end

+function fpu_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
+    CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_DEFAULT_MATH)
+    CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
+end
+
+function wmma_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
+    CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
+    CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
+end
+
 function get_configs()
     rv = []

     # FPU Op
-    for (A_type, B_type, CD_type) in [
-        (Float16, Float16, Float32),
-        (Float32, Float32, Float32),
-        (Float32, Float32, Float64),
-        (Float64, Float64, Float64),
-        (Int16, Int16, Int16),
-        (Int32, Int32, Int32),
-        (Int64, Int64, Int64)],
+    for (A_type, B_type, CD_type, baseline_func) in [
+        (Float16, Float16, Float32, fpu_baseline),
+        (Float32, Float32, Float32, fpu_baseline),
+        (Float32, Float32, Float64, nothing),
+        (Float64, Float64, Float64, fpu_baseline),
+        (Int16, Int16, Int16, nothing),
+        (Int32, Int32, Int32, nothing),
+        (Int64, Int64, Int64, nothing)],
         transpose_a = [false, true],
         transpose_b = [false, true],
         (OP_M, OP_N, OP_K, OP_MB, OP_NB, OP_KB) in [(8, 16, 2, 4, 8, 1)],
@@ -151,7 +168,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                baseline_func))
     end

     # FPU Op shapes
@@ -209,7 +227,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                fpu_baseline))
     end

     # Tropical GEMM
@@ -254,7 +273,8 @@
                                 get_custom_mul!((a, b, c) -> max(a + b, c)),
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA GEMM
@@ -298,7 +318,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                wmma_baseline))
     end

     # WMMA GEMM + bias
@@ -344,7 +365,8 @@
                                 mul!,
                                 Epilogue.Bias(pointer(bias)),
                                 (c_h, d) -> verify_bias(c_h, d, bias),
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA Diagonal GEMM
@@ -394,7 +416,8 @@
                                 (C, A, B, alpha, beta) -> mul!(C, Diagonal(A[1:M,1]), B, true, true),
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_singlestage))
+                                Kernel.matmul_singlestage,
+                                nothing))
     end

     # WMMA Complex GEMM
@@ -453,7 +476,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA Dual GEMM
@@ -511,7 +535,8 @@
                                 (C, A, B, alpha, beta) -> mul!(dual_conv(C), dual_conv(Complex{Float32}.(A)), dual_conv(Complex{Float32}.(B)), true, true),
                                 Epilogue.Default(),
                                 verify_dual,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     rv