diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
index f49a16d0..5f402eec 100644
--- a/benchmarks/runbenchmarks.jl
+++ b/benchmarks/runbenchmarks.jl
@@ -105,6 +105,7 @@ end
 include("../configs/configs.jl")

 results = Dict()
+baseline_results = Dict()
 details = Dict()

 for cf in get_configs()
@@ -115,7 +116,7 @@ for cf in get_configs()
     run_gemm(cf, a, b, c, d)

     # benchmark
-    profile_results = CUDA.@profiled begin
+    profile_results = CUDA.@profile begin
         for sample in 1:NUM_SAMPLES
             run_gemm(cf, a, b, c, d)
         end
@@ -123,22 +124,41 @@ for cf in get_configs()

     # XXX: This works for now, since every GEMM is one kernel, but later on we may want to benchmark
     # operations consisting of multiple kernel launches...
-    # XXX: Will this always work with mangling?
-    matmul_results = filter(row -> contains(row.name, String(Symbol(cf.kernel))), profile_results.device)
-
-    @assert size(matmul_results, 1) == NUM_SAMPLES
+    profile_results = profile_results.device

     # get info
     details[cf.name] = Dict(
-        "registers" => matmul_results[1, "registers"],
-        "dynamic_shared_mem" => matmul_results[1, "shared_mem"].dynamic,
-        "static_shared_mem" => matmul_results[1, "shared_mem"].static,
-        "local_mem" => matmul_results[1, "local_mem"].thread
+        "registers" => profile_results[1, "registers"],
+        "dynamic_shared_mem" => profile_results[1, "shared_mem"].dynamic,
+        "static_shared_mem" => profile_results[1, "shared_mem"].static,
+        "local_mem" => profile_results[1, "local_mem"].thread
     )

-    times = 1e9 .* (matmul_results[!, "stop"] - matmul_results[!, "start"])
+    times = 1e9 .* (profile_results[!, "stop"] - profile_results[!, "start"])
+    @assert length(times) == NUM_SAMPLES
+
+    @info "\tGemmKernels: $(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"
+
+    if !isnothing(cf.baseline)
+        # benchmark baseline
+        baseline_profile_results = CUDA.@profile begin
+            for sample in 1:NUM_SAMPLES
+                run_baseline(cf, a, b, c, d)
+            end
+        end
+
+        baseline_profile_results = baseline_profile_results.device
+        @assert size(baseline_profile_results, 1) % NUM_SAMPLES == 0
+
+        baseline_times = 1e9 .* sum.(Iterators.partition(baseline_profile_results[!, "stop"] - baseline_profile_results[!, "start"], size(baseline_profile_results, 1) ÷ NUM_SAMPLES))
+        @assert length(baseline_times) == NUM_SAMPLES
+
+        baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"
+        @info "\tBaseline: $(prettytime(baseline_times)) $(prettyflops(baseline_times, cf.config.matmul_shape)) (GemmKernels: $(baseline_ratio)%)"
+
+        baseline_results[cf.name] = Dict("times" => baseline_times)
+    end

-    @info "\t$(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"
     results[cf.name] = Dict("times" => times)
 end

@@ -303,6 +323,20 @@ if previous_results !== nothing
         end
     end

+    # Print results compared to baseline.
+    println(io, "# Comparison with baseline")
+
+    println(io, "| test | GemmKernels | Baseline | % |")
+    println(io, "|------|-------------|----------|---|")
+
+    for k in keys(baseline_results)
+        times = results[k]["times"]
+        baseline_times = baseline_results[k]["times"]
+        baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"
+
+        println(io, "| $(markdown_escaped_code(k)) | $(prettytime(times)) | $(prettytime(baseline_times)) | $(baseline_ratio) |")
+    end
+
     body = String(take!(io))

     println(body)
diff --git a/configs/configs.jl b/configs/configs.jl
index 56568605..cf83647b 100644
--- a/configs/configs.jl
+++ b/configs/configs.jl
@@ -21,6 +21,7 @@ struct Configuration
     epilogue        # The epilogue to use.
     verify          # Verify function to use.
     kernel          # The kernel function to use.
+    baseline        # Baseline implementation to compare performance against
 end

 function get_custom_mul!(element_update)
@@ -82,6 +83,12 @@ function run_gemm(cf::Configuration, a, b, c, d)
                        kernel = cf.kernel)
 end

+# Run the baseline.
+function run_baseline(cf::Configuration, a, b, c, d)
+    @assert !isnothing(cf.baseline)
+    cf.baseline(a, b, c, d, cf.alpha, cf.beta, cf.transpose_a, cf.transpose_b)
+end
+
 # Verify results.
 function verify(cf::Configuration, c_h, d)
     cf.verify(c_h, d)
@@ -101,18 +108,28 @@ function verify_dual(c_h, d)
     isapprox(c_dual, d_dual)
 end

+function fpu_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
+    CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_DEFAULT_MATH)
+    CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
+end
+
+function wmma_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
+    CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
+    CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
+end
+
 function get_configs()
     rv = []

     # FPU Op
-    for (A_type, B_type, CD_type) in [
-        (Float16, Float16, Float32),
-        (Float32, Float32, Float32),
-        (Float32, Float32, Float64),
-        (Float64, Float64, Float64),
-        (Int16, Int16, Int16),
-        (Int32, Int32, Int32),
-        (Int64, Int64, Int64)],
+    for (A_type, B_type, CD_type, baseline_func) in [
+        (Float16, Float16, Float32, fpu_baseline),
+        (Float32, Float32, Float32, fpu_baseline),
+        (Float32, Float32, Float64, nothing),
+        (Float64, Float64, Float64, fpu_baseline),
+        (Int16, Int16, Int16, nothing),
+        (Int32, Int32, Int32, nothing),
+        (Int64, Int64, Int64, nothing)],
         transpose_a = [false, true],
         transpose_b = [false, true],
         (OP_M, OP_N, OP_K, OP_MB, OP_NB, OP_KB) in [(8, 16, 2, 4, 8, 1)],
@@ -151,7 +168,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                baseline_func))
     end

     # FPU Op shapes
@@ -209,7 +227,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                fpu_baseline))
     end

     # Tropical GEMM
@@ -254,7 +273,8 @@
                                 get_custom_mul!((a, b, c) -> max(a + b, c)),
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA GEMM
@@ -298,7 +318,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                wmma_baseline))
     end

     # WMMA GEMM + bias
@@ -344,7 +365,8 @@
                                 mul!,
                                 Epilogue.Bias(pointer(bias)),
                                 (c_h, d) -> verify_bias(c_h, d, bias),
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA Diagonal GEMM
@@ -394,7 +416,8 @@
                                 (C, A, B, alpha, beta) -> mul!(C, Diagonal(A[1:M,1]), B, true, true),
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_singlestage))
+                                Kernel.matmul_singlestage,
+                                nothing))
     end

     # WMMA Complex GEMM
@@ -453,7 +476,8 @@
                                 mul!,
                                 Epilogue.Default(),
                                 verify_default,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     # WMMA Dual GEMM
@@ -511,7 +535,8 @@
                                 (C, A, B, alpha, beta) -> mul!(dual_conv(C), dual_conv(Complex{Float32}.(A)), dual_conv(Complex{Float32}.(B)), true, true),
                                 Epilogue.Default(),
                                 verify_dual,
-                                Kernel.matmul_pipelined))
+                                Kernel.matmul_pipelined,
+                                nothing))
     end

     rv