diff --git a/.gitignore b/.gitignore
index c181d1f8..d9a944ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-test/Manifest.toml
+Manifest.toml
+tuning/.CondaPkg/
diff --git a/configs/configs.jl b/configs/configs.jl
index c88b2d8d..d0f7b672 100644
--- a/configs/configs.jl
+++ b/configs/configs.jl
@@ -3,6 +3,7 @@
 using GemmKernels
 using LinearAlgebra
 using ForwardDiff
+using Octavian
 
 struct Configuration
     name              # Human-readable name of the configuration.
@@ -64,7 +65,8 @@ function generate_inputs(cf::Configuration)
     new_b_h = cf.transpose_b ? transpose(b_h) : b_h
 
     (cf.calc_reference)(c_h, new_a_h, new_b_h, cf.alpha, cf.beta)
-    c_h, a, b, c, d
+    c_ref = CuArray(c_h)
+    c_ref, a, b, c, d
 end
 
 # Run the GEMM.
@@ -88,21 +90,21 @@
 end
 
 # Verify results.
-function verify(cf::Configuration, c_h, d)
-    cf.verify(c_h, d)
+function verify(cf::Configuration, c_ref, d)
+    cf.verify(c_ref, d)
 end
 
-function verify_default(c_h, d)
-    isapprox(c_h, Array(d))
+function verify_default(c_ref, d)
+    isapprox(c_ref, d)
 end
 
-function verify_bias(c_h, d, bias)
-    c_h .+ Array(bias) ≈ Array(d)
+function verify_bias(c_ref, d, bias)
+    c_ref .+ bias ≈ d
 end
 
-function verify_dual(c_h, d)
-    c_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, c_h)
-    d_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, Array(d))
+function verify_dual(c_ref, d)
+    c_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, c_ref)
+    d_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, d)
     isapprox(c_dual, d_dual)
 end
@@ -238,10 +240,10 @@ macro get_wmma_config()
             CD_type,
             transpose_a,
             transpose_b,
-            mul!,
+            Octavian.matmul!,
             Epilogue.Default(),
             verify_default,
-            Kernel.matmul_pipelined,
+            kernel,
             wmma_baseline)
     end end)
 end
@@ -520,7 +522,8 @@ function get_configs()
                  [2, 2, 1], [1, 1, 2], [2, 2, 2]],
                 [[2048, 2048, 2048]]),
-        zero_c in [false]
+        zero_c in [false],
+        kernel in [Kernel.matmul_pipelined]
 
         push!(rv, @get_wmma_config)
     end
diff --git a/src/config.jl b/src/config.jl
index 29c26f0c..846f4c62 100644
--- a/src/config.jl
+++ b/src/config.jl
@@ -215,6 +215,12 @@ function get_config(; gemm_shape, operator, global_a_layout, global_c_layout, kw
     prod(mem_b_warp) * warps_per_block ≤ block_shape.K * block_shape.N || throw(ConfigError("mem_b_warp is too big for the selected block shape: need at least one iteration in the memory copy loop!"))
     prod(mem_cd_warp) * warps_per_block ≤ block_shape.M * block_shape.N || throw(ConfigError("mem_cd_warp is too big for the selected block shape: need at least one iteration in the memory copy loop!"))
 
+    # Check that each tile fits within its parent shape.
+    check_tile_smaller(lhs, rhs, msg) = ((lhs.M ≤ rhs.M) && (lhs.N ≤ rhs.N) && (lhs.K ≤ rhs.K)) || throw(ConfigError(msg))
+
+    check_tile_smaller(compute_warp, block_shape, "compute_warp must not be larger than block_shape!")
+    check_tile_smaller(block_shape, gemm_shape, "block_shape must not be larger than gemm_shape!")
+
     return Config(
         #= Params =#
         gemm_shape,
diff --git a/src/matmul.jl b/src/matmul.jl
index 3c4bde0d..f3dfd34f 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -34,6 +34,30 @@ function matmul(conf::Config, a, b, c, d;
         conf.block_shape.K ≥ 2 * conf.compute_op_shape.K || throw(ConfigError("Need at least two stages to use a pipelined kernel, i.e. BLOCK_K ≥ 2 * OPERATOR_K"))
     end
 
+    # Check the LocalArray size limit of 32 elements.
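+    # (As a sketch of the arithmetic: a 64×64 compute_warp with a 16×16×16
+    # operator holds (64 ÷ 16) * (64 ÷ 16) = 16 accumulator fragments per
+    # warp, which fits; a 128×128 compute_warp would need 64 and is rejected.)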
+    if kernel == Kernel.matmul_singlestage
+        num_fragments_m = conf.compute_warp.M ÷ conf.compute_op_shape.M
+        num_fragments_n = conf.compute_warp.N ÷ conf.compute_op_shape.N
+
+        num_fragments_m * num_fragments_n < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+    end
+
+    if kernel == Kernel.matmul_pipelined
+        num_fragments_m = conf.compute_warp.M ÷ conf.compute_op_shape.M
+        num_fragments_n = conf.compute_warp.N ÷ conf.compute_op_shape.N
+
+        a_frag_i = (conf.block_shape.M * conf.block_shape.K) ÷ (conf.mem_a_warp.M * conf.mem_a_warp.K * conf.warps_per_block)
+        a_frag_j = (conf.mem_a_warp.M * conf.mem_a_warp.K) ÷ (conf.mem_a_thread.M * conf.mem_a_thread.K * 32)
+        b_frag_i = (conf.block_shape.K * conf.block_shape.N) ÷ (conf.mem_b_warp.K * conf.mem_b_warp.N * conf.warps_per_block)
+        b_frag_j = (conf.mem_b_warp.K * conf.mem_b_warp.N) ÷ (conf.mem_b_thread.K * conf.mem_b_thread.N * 32)
+
+        num_fragments_m * num_fragments_n < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+        a_frag_i * a_frag_j < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+        b_frag_i * b_frag_j < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+        2 * num_fragments_m < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+        2 * num_fragments_n < 32 || throw(ConfigError("Config exceeds LocalArray size limit of 32 elements!"))
+    end
+
     hostkernel = @cuda launch=false kernel(args...)
     attributes(hostkernel.fun)[CUDA.FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES] = shmem
diff --git a/test/Project.toml b/test/Project.toml
index 8828b9af..8d1be0aa 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -5,5 +5,6 @@ Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 XUnit = "3e3c03f2-1a94-11e9-2981-050a4ca824ab"
diff --git a/tuning/Project.toml b/tuning/Project.toml
new file mode 100644
index 00000000..8fab9e13
--- /dev/null
+++ b/tuning/Project.toml
@@ -0,0 +1,14 @@
+[deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
+Hyperopt = "93e5fe13-2215-51db-baaf-2e9a34fb2712"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
+PythonPlot = "274fc56d-3b97-40fa-a1cd-1b4a50311bf9"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git a/tuning/best-configs.bin b/tuning/best-configs.bin
new file mode 100644
index 00000000..63bb0a33
Binary files /dev/null and b/tuning/best-configs.bin differ
diff --git a/tuning/configs.bin b/tuning/configs.bin
new file mode 100644
index 00000000..41e0df5f
Binary files /dev/null and b/tuning/configs.bin differ
diff --git a/tuning/tune-wmma.jl b/tuning/tune-wmma.jl
new file mode 100644
index 00000000..f2a911c6
--- /dev/null
+++ b/tuning/tune-wmma.jl
@@ -0,0 +1,518 @@
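+# Exhaustive tuning script for the WMMA GEMM kernels: it sweeps all
+# combinations of block shape, warp layout and kernel variant for every matrix
+# size and transposition, then benchmarks the best configuration of each
+# against the cuBLAS baseline and plots the relative performance.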
+using CUDA, GemmKernels
+using DataFrames
+using DataStructures
+using Dates
+using Logging
+using LoggingExtras
+using Plots
+using ProgressMeter
+using Serialization
+using Statistics
+using StatsBase
+
+pythonplot()
+
+#######
+
+const N_vals = 2 .^ (7:14)
+
+# Stop sampling when the normalised 95% confidence interval is smaller than this...
+const BENCH_NORM_CI_THRESHOLD = 0.01
+
+# ... or when we have exceeded the time limit...
+const BENCH_MAX_NUM_SECONDS = 5
+
+# ... but take at least 10 samples.
+const BENCH_MIN_NUM_SAMPLES = 10
+
+#####
+
+# Stop gathering samples for the plot when the "error bars" are smaller than this...
+const PLOT_RATIO_MAX_UNCERTAINTY = 0.05
+
+# ... or when we have exceeded the time limit...
+const PLOT_MAX_NUM_SECONDS = 180
+
+# ... but take at least 10 samples.
+const PLOT_MIN_NUM_SAMPLES = 10
+
+const AB_type = Float16
+const CD_type = Float32
+
+const zero_c = true
+
+const OP_M, OP_N, OP_K = 16, 16, 16
+
+#######
+
+# Reuse inputs across iterations.
+c_ref = nothing
+a = nothing
+b = nothing
+c = nothing
+d = nothing
+input_transpose_a = nothing
+input_transpose_b = nothing
+input_N = nothing
+
+include("../configs/configs.jl")
+
+# Write logging messages to a file for persistence.
+timestamp_logger(logger) = TransformerLogger(logger) do log
+    merge(log, (; message = "$(Dates.format(now(), "yyyy-mm-dd HH:MM:SS")) $(log.message)"))
+end
+FileLogger("tuning/tuning.log"; append=true) |> timestamp_logger |> (x -> MinLevelLogger(x, Logging.Info)) |> global_logger
+
+function kernel_string_to_function(str)
+    Dict(
+        "singlestage" => Kernel.matmul_singlestage,
+        "pipelined" => Kernel.matmul_pipelined
+    )[str]
+end
+
+get_label(transpose_a, transpose_b) = "$(transpose_a ? "T" : "N")$(transpose_b ? "T" : "N")"
+
+function generate_configs()
+    all_configs = DataFrame(
+        transpose_a=Bool[],
+        transpose_b=Bool[],
+        N=Int[],
+        BLOCK_M=Int[],
+        BLOCK_N=Int[],
+        BLOCK_K=Int[],
+        WARPS_M=Int[],
+        WARPS_N=Int[],
+        kernel_str=String[],
+        category=String[],
+        times=Vector{Any}[]
+    )
+
+    for transpose_a in [false, true],
+        transpose_b in [false, true],
+        N in N_vals,
+        BLOCK_M in 2 .^ (6:9),
+        BLOCK_N in 2 .^ (6:9),
+        BLOCK_K in 2 .^ (5:7),
+        WARPS_M in 2 .^ (0:3),
+        WARPS_N in 2 .^ (0:3),
+        kernel_str in ["singlestage", "pipelined"]
+
+        push!(all_configs, Dict(
+            :transpose_a => transpose_a,
+            :transpose_b => transpose_b,
+            :N => N,
+            :BLOCK_M => BLOCK_M,
+            :BLOCK_N => BLOCK_N,
+            :BLOCK_K => BLOCK_K,
+            :WARPS_M => WARPS_M,
+            :WARPS_N => WARPS_N,
+            :kernel_str => kernel_str,
+            :category => "unknown",
+            :times => [],
+        ))
+    end
+
+    all_configs
+end
+
+function get_config(row)
+    transpose_a = row["transpose_a"]
+    transpose_b = row["transpose_b"]
+    M = N = K = row["N"]
+    BLOCK_M = row["BLOCK_M"]
+    BLOCK_N = row["BLOCK_N"]
+    BLOCK_K = row["BLOCK_K"]
+    WARPS_M = row["WARPS_M"]
+    WARPS_N = row["WARPS_N"]
+    kernel = kernel_string_to_function(row["kernel_str"])
+
+    @get_wmma_config
+end
+
+function generate_inputs_if_needed(row)
+    global input_transpose_a, input_transpose_b, input_N, c_ref, a, b, c, d
+
+    cf = get_config(row)
+
+    if (input_transpose_a, input_transpose_b, input_N) != (row.transpose_a, row.transpose_b, row.N)
+        c_ref, a, b, c, d = generate_inputs(cf)
+        input_transpose_a, input_transpose_b, input_N = row.transpose_a, row.transpose_b, row.N
+    end
+end
+
+function get_inputs_for_plot(input_dict, row)
+    if row.N ∉ keys(input_dict)
+        cf = get_config(row)
+        _, a, b, c, d = generate_inputs(cf)
+        input_dict[row.N] = (a, b, c, d)
+    end
+
+    return input_dict[row.N]
+end
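+
+# Run and time a single configuration. Returns the list of samples (in
+# seconds; [Inf] on failure) together with a category string for bookkeeping.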
+function measure_config(row)
+    cf = get_config(row)
+
+    generate_inputs_if_needed(row)
+
+    d .= 0
+
+    try
+        run_gemm(cf, a, b, c, d)
+    catch err
+        if isa(err, GemmKernels.ConfigError)
+            @info "Skipping configuration: $(NamedTuple(row))\n" * sprint(Base.showerror, err)
+            return [Inf], "unsupported_config_post_run"
+        end
+
+        if isa(err, CuError)
+            @error "Configuration failed: $(NamedTuple(row))\n" * sprint(Base.showerror, err)
+            rethrow()
+        end
+
+        @info "Skipping configuration: $(NamedTuple(row))\n" * sprint(Base.showerror, err)
+        return [Inf], "error"
+    end
+
+    if !verify(cf, c_ref, d)
+        @warn "Configuration produced an invalid result: $(NamedTuple(row))"
+
+        expected = c_ref
+        actual = d
+
+        mad, index = findmax(abs.(expected - actual))
+        @warn "Maximum absolute deviation is $(mad) at index $(index)."
+
+        return [Inf], "invalid_result"
+    end
+
+    times = Float64[]
+
+    # Use CUDA.@elapsed instead of CUDA.@profile, because the latter is slower.
+    device_synchronize()
+    GC.gc(true)
+
+    start_time = Dates.now()
+
+    while true
+        synchronize(stream())
+        time = CUDA.@elapsed run_gemm(cf, a, b, c, d)
+        push!(times, time)
+
+        if length(times) >= BENCH_MIN_NUM_SAMPLES
+            (Dates.now() - start_time > Second(BENCH_MAX_NUM_SECONDS)) && break
+            (confidence_interval_95(times) / median(times) < BENCH_NORM_CI_THRESHOLD) && break
+        end
+    end
+
+    return times, "success"
+end
+
+# Approximate 95% confidence interval of the median (boxplot notch heuristic).
+confidence_interval_95(times) = 1.58 * iqr(times) / sqrt(length(times))
+
+function prettytime(times)
+    min, q1, med, q3, max = nquantile(times, 4)
+    ci_95 = confidence_interval_95(times)
+
+    # timescale
+    scale, unit = if med < 1e3
+        1, "ns"
+    elseif med < 1e6
+        1e3, "μs"
+    elseif med < 1e9
+        1e6, "ms"
+    else
+        1e9, "s"
+    end
+
+    rnd_min, rnd_q1, rnd_med, rnd_q3, rnd_max, rnd_ci_95 = round.([min, q1, med, q3, max, ci_95] ./ scale; sigdigits=3)
+    rnd_rel_ci_95 = round(100 * ci_95 / med; sigdigits=3)
+
+    return "$rnd_med $unit ± $rnd_ci_95 $unit ($rnd_rel_ci_95%) (length: $(length(times)), 5-num summary: $rnd_min, $rnd_q1, $rnd_med, $rnd_q3, $rnd_max $unit)"
+end
+
+perf_ratio(gemmkernels, baseline) = percentile(baseline, 0) / percentile(gemmkernels, 0)
+perf_ratio_lo(gemmkernels, baseline) = percentile(baseline, 0) / percentile(gemmkernels, 75)
+perf_ratio_hi(gemmkernels, baseline) = percentile(baseline, 75) / percentile(gemmkernels, 0)
+
+function get_uncertainty(gk, bl)
+    lo, mid, hi = (perf_ratio_lo(gk, bl), perf_ratio(gk, bl), perf_ratio_hi(gk, bl))
+
+    hi_uncertainty = abs(hi - mid) / mid
+    lo_uncertainty = abs(lo - mid) / mid
+    uncertainty = max(hi_uncertainty, lo_uncertainty)
+
+    uncertainty, lo_uncertainty, hi_uncertainty
+end
+
+function got_enough_samples(row)
+    gk, bl = row["gemmkernels_times"], row["baseline_times"]
+
+    (length(gk) < PLOT_MIN_NUM_SAMPLES) && return false
+    (length(bl) < PLOT_MIN_NUM_SAMPLES) && return false
+
+    (row["time_spent"] >= PLOT_MAX_NUM_SECONDS) && return true
+
+    uncertainty, _, _ = get_uncertainty(gk, bl)
+
+    uncertainty < PLOT_RATIO_MAX_UNCERTAINTY
+end
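+
+# For every (transposition, N) combination, select the configuration with the
+# lowest minimum time from the sweep, then benchmark it against the cuBLAS
+# baseline until the performance ratio is known accurately enough.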
+function benchmark_best_configs(configs)
+    best_configs = DataFrame(
+        transpose_a=Bool[],
+        transpose_b=Bool[],
+        N=Int[],
+        BLOCK_M=Int[],
+        BLOCK_N=Int[],
+        BLOCK_K=Int[],
+        WARPS_M=Int[],
+        WARPS_N=Int[],
+        kernel_str=String[],
+        category=String[],
+        uncertainty=Float64[],
+        time_spent=Float64[],
+        gemmkernels_times=Vector{Any}[],
+        baseline_times=Vector{Any}[]
+    )
+
+    for transpose_a = [false, true],
+        transpose_b = [false, true],
+        N = N_vals
+
+        relevant_configs = configs[(@. (configs[!, "transpose_a"] == transpose_a) & (configs[!, "transpose_b"] == transpose_b) & (configs[!, "N"] == N)), :]
+        # init=Inf handles configurations that never produced a sample.
+        _, best_config_index = findmin(minimum.(relevant_configs[!, "times"], init=Inf))
+        best_config = relevant_configs[best_config_index, :]
+
+        push!(best_configs, Dict(
+            :transpose_a => transpose_a,
+            :transpose_b => transpose_b,
+            :N => N,
+            :BLOCK_M => best_config["BLOCK_M"],
+            :BLOCK_N => best_config["BLOCK_N"],
+            :BLOCK_K => best_config["BLOCK_K"],
+            :WARPS_M => best_config["WARPS_M"],
+            :WARPS_N => best_config["WARPS_N"],
+            :kernel_str => best_config["kernel_str"],
+            :category => "todo",
+            :uncertainty => Inf,
+            :time_spent => 0.0,
+            :gemmkernels_times => [],
+            :baseline_times => [],
+        ))
+    end
+
+    # Reuse matrix inputs across iterations; all matrix sizes for one
+    # transposition (e.g. NN) take about 4 GB of GPU memory. Group runs of the
+    # same transposition together, so we don't have to keep 4 * 4 GB of inputs
+    # in memory at once.
+    for transpose_a in [false, true],
+        transpose_b in [false, true]
+
+        input_dict = Dict()
+
+        p = ProgressUnknown(desc="Benchmarking (highest uncertainty)", dt=1.0)
+
+        # Spread the samples of one configuration over time, to reduce the
+        # effect of time-related noise. Note that this means that the progress
+        # bar may make big jumps.
+        while true
+            (sum(@. (best_configs[!, "category"] == "todo") & (best_configs[!, "transpose_a"] == transpose_a) & (best_configs[!, "transpose_b"] == transpose_b)) == 0) && break
+
+            for config_row in eachrow(best_configs)
+                if (config_row.category, config_row.transpose_a, config_row.transpose_b) != ("todo", transpose_a, transpose_b)
+                    continue
+                end
+
+                a, b, c, d = get_inputs_for_plot(input_dict, config_row)
+                cf = get_config(config_row)
+
+                @info "Profiling configuration $(NamedTuple(config_row))..."
+
+                start_time = Dates.now()
+
+                prof = CUDA.@profile run_gemm(cf, a, b, c, d)
+                push!(config_row["gemmkernels_times"], sum(prof.device[!, "stop"] - prof.device[!, "start"]))
+
+                prof = CUDA.@profile run_baseline(cf, a, b, c, d)
+                push!(config_row["baseline_times"], sum(prof.device[!, "stop"] - prof.device[!, "start"]))
+
+                config_row["time_spent"] += (Dates.now() - start_time) / Second(1)
+                config_row["uncertainty"], _, _ = get_uncertainty(config_row["gemmkernels_times"], config_row["baseline_times"])
+
+                if got_enough_samples(config_row)
+                    config_row["category"] = "done"
+                end
+
+                # Update the progress bar.
+                highest_uncertainty = best_configs[(@. (best_configs[!, "transpose_a"] == transpose_a) & (best_configs[!, "transpose_b"] == transpose_b)), :]
+                highest_uncertainty = maximum(highest_uncertainty[!, "uncertainty"])
+                next!(p; showvalues = [
+                    (:transpose_a, transpose_a),
+                    (:transpose_b, transpose_b),
+                    (:N, config_row["N"]),
+                    (:num_samples, length(config_row["gemmkernels_times"])),
+                    (:uncertainty, config_row["uncertainty"]),
+                    (:time_spent_in_config, config_row["time_spent"]),
+                    (:highest_uncertainty, highest_uncertainty),
+                    (:remaining_configurations, sum(best_configs[!, "category"] .== "todo"))
+                ])
+            end
+        end
+    end
+
+    best_configs
+end
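+
+# Plot the performance of the best configurations relative to cuBLAS, one
+# series per transposition, with ribbons spanning the uncertainty interval.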
+function plot_results(best_configs)
+    markershapes = Dict(
+        "NN" => :circle,
+        "NT" => :dtriangle,
+        "TN" => :diamond,
+        "TT" => :cross
+    )
+
+    p = plot()
+    title!("$AB_type x $AB_type = $CD_type ($(CUDA.name(CUDA.device())))")
+    xlabel!("Matrix size [-]")
+    ylabel!("Performance relative to cuBLAS [%]")
+
+    for transpose_a in [false, true],
+        transpose_b in [false, true]
+
+        label = get_label(transpose_a, transpose_b)
+
+        relevant_configs = best_configs[(@. (best_configs[!, "transpose_a"] == transpose_a) & (best_configs[!, "transpose_b"] == transpose_b)), :]
+
+        ratios = @. 100 * perf_ratio(relevant_configs.gemmkernels_times, relevant_configs.baseline_times)
+        ratios_lo = @. 100 * perf_ratio_lo(relevant_configs.gemmkernels_times, relevant_configs.baseline_times)
+        ratios_hi = @. 100 * perf_ratio_hi(relevant_configs.gemmkernels_times, relevant_configs.baseline_times)
+
+        plot!(p, relevant_configs.N, ratios, ribbon=(ratios .- ratios_lo, ratios_hi .- ratios), label=label, markershape=markershapes[label], xscale=:log2)
+    end
+
+    savefig(p, "tuning/plot.pdf")
+end
+
+function main()
+    @info "Starting WMMA tuning script..."
+
+    configs = nothing
+
+    if !isfile("tuning/configs.bin")
+        # (1) Generate configurations.
+        @info "Generating configurations..."
+        configs = generate_configs()
+        @info "Generated $(size(configs, 1)) configurations."
+
+        # (2) Filter configurations where we can determine upfront that they are unsupported.
+        @info "Filtering configurations that we know are unsupported a priori..."
+
+        for config_row in eachrow(configs)
+            try
+                cf = get_config(config_row)
+            catch err
+                if isa(err, GemmKernels.ConfigError)
+                    config_row["category"] = "unsupported_config_pre_run"
+                else
+                    rethrow()
+                end
+            end
+        end
+
+        @info "Filtered $(counter(configs[!, "category"])["unsupported_config_pre_run"]) configurations."
+
+        open("tuning/configs.bin", "w") do io
+            serialize(io, configs)
+        end
+    end
+
+    @info "Loading configurations from disk..."
+    configs = open("tuning/configs.bin", "r") do io
+        deserialize(io)
+    end
+    @info "Loaded $(size(configs, 1)) configurations."
+
+    # (3) Measure the performance of the configurations.
+    num_unknown = counter(configs[!, "category"])["unknown"]
+    p = Progress(num_unknown; desc="Parameter sweep", dt=1.0, showspeed=true)
+
+    @info "Need to perform parameter sweep over $(num_unknown) configurations."
+
+    # Generate inputs for the first configuration. This is not strictly
+    # necessary, but doing it outside of the loop means that the first
+    # iteration will not be excessively slow, which improves the "ETA"
+    # estimate.
+    first_unknown_config = findfirst(configs[!, "category"] .== "unknown")
+    !isnothing(first_unknown_config) && generate_inputs_if_needed(configs[first_unknown_config, :])
+
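+    # Sweep in a crash-tolerant way: mark each configuration as "crashed" and
+    # serialize the results before measuring it, so that a fatal error (e.g. a
+    # GPU hang) is attributed to the offending configuration when the wrapper
+    # script restarts us.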
+    for config_row in eachrow(configs)
+        start_time = Dates.now()
+
+        if config_row.category != "unknown"
+            continue
+        end
+
+        config_row.category = "crashed"
+
+        # Save the results in case the process crashes.
+        open("tuning/configs.bin", "w") do io
+            serialize(io, configs)
+        end
+
+        @info "Measuring configuration $(NamedTuple(config_row))..."
+
+        times, category = measure_config(config_row)
+
+        @info "Result for $(NamedTuple(config_row)): $(category) -- $(prettytime(times .* 1e9))"
+
+        config_row.category = category
+        config_row.times = times
+
+        counter_dict_abs = Dict(counter(configs[!, "category"]))
+        counter_dict_rel = Dict(k => "$(round(100 * v / sum(values(counter_dict_abs)); sigdigits=3))%" for (k, v) in counter_dict_abs)
+
+        next!(p; showvalues=[
+            (:N, config_row.N),
+            (:transpose, get_label(config_row.transpose_a, config_row.transpose_b)),
+            (:block_shape, (config_row.BLOCK_M, config_row.BLOCK_N, config_row.BLOCK_K)),
+            (:num_warps, (config_row.WARPS_M, config_row.WARPS_N)),
+            (:kernel, config_row.kernel_str),
+            (:counters, counter_dict_abs),
+            (:counters_relative, counter_dict_rel),
+            (:last_result, "$(category) -- $(prettytime(times .* 1e9))"),
+            (:last_iteration_time, Dates.now() - start_time)
+        ])
+    end
+
+    # Save the data of the final iteration.
+    open("tuning/configs.bin", "w") do io
+        serialize(io, configs)
+    end
+
+    # And load it again, for good measure.
+    configs = open("tuning/configs.bin", "r") do io
+        deserialize(io)
+    end
+
+    # (4) Select the best configurations, and benchmark them.
+    if !isfile("tuning/best-configs.bin")
+        @info "Benchmarking configurations for plot..."
+        best_configs = benchmark_best_configs(configs)
+
+        open("tuning/best-configs.bin", "w") do io
+            serialize(io, best_configs)
+        end
+    end
+
+    @info "Loading best configurations from disk..."
+    best_configs = open("tuning/best-configs.bin", "r") do io
+        deserialize(io)
+    end
+
+    # (5) Plot the results.
+    @info "Plotting results..."
+    plot_results(best_configs)
+end
+
+
+isinteractive() || main()
diff --git a/tuning/tune-wmma.sh b/tuning/tune-wmma.sh
new file mode 100755
index 00000000..a97b779c
--- /dev/null
+++ b/tuning/tune-wmma.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+# Start from a clean slate; crashes inside the loop below resume from the
+# serialized state instead.
+rm -f configs.bin
+rm -f tuning.log
+
+cd ..
+
+until julia --project -e '
+    println("--- :julia: Instantiating project")
+    using Pkg
+    Pkg.instantiate()
+    Pkg.activate("tuning")
+    Pkg.instantiate()
+    push!(LOAD_PATH, @__DIR__)
+
+    println("+++ :julia: Tuning")
+    include("tuning/tune-wmma.jl")'; do
+
+    echo "Tuning script crashed. Resuming after 1 second..." >&2
+    sleep 1
+done