From 4860245c80f77476f5c93d10a5a72af19899798a Mon Sep 17 00:00:00 2001
From: skyleaworlder <870033938@qq.com>
Date: Wed, 19 Jul 2023 06:53:19 +0000
Subject: [PATCH] refact: split nnlib benchmarks

---
 benchmark/benchmark/flux.jl              |   2 +-
 benchmark/benchmark/nnlib.jl             | 261 +----------------------
 benchmark/benchmark/nnlib/activations.jl |  12 ++
 benchmark/benchmark/nnlib/attention.jl   |  26 +++
 benchmark/benchmark/nnlib/conv.jl        |  58 +++++
 benchmark/benchmark/nnlib/dropout.jl     |  20 ++
 benchmark/benchmark/nnlib/gemm.jl        |  26 +++
 benchmark/benchmark/nnlib/pooling.jl     |  38 ++++
 benchmark/benchmark/nnlib/softmax.jl     |  23 ++
 benchmark/benchmark/nnlib/upsample.jl    |  35 +++
 10 files changed, 247 insertions(+), 254 deletions(-)
 create mode 100644 benchmark/benchmark/nnlib/activations.jl
 create mode 100644 benchmark/benchmark/nnlib/attention.jl
 create mode 100644 benchmark/benchmark/nnlib/conv.jl
 create mode 100644 benchmark/benchmark/nnlib/dropout.jl
 create mode 100644 benchmark/benchmark/nnlib/gemm.jl
 create mode 100644 benchmark/benchmark/nnlib/pooling.jl
 create mode 100644 benchmark/benchmark/nnlib/softmax.jl
 create mode 100644 benchmark/benchmark/nnlib/upsample.jl

diff --git a/benchmark/benchmark/flux.jl b/benchmark/benchmark/flux.jl
index 757d941..ab2da5f 100644
--- a/benchmark/benchmark/flux.jl
+++ b/benchmark/benchmark/flux.jl
@@ -2,4 +2,4 @@ using Flux
 
 SUITE["flux"] = BenchmarkGroup()
 
-register_benchmark("FLUXML_BENCHMARK_FLUX_MLP", "benchmark/flux/mlp.jl")
+register_benchmark("FLUXML_BENCHMARK_FLUX_MLP", "flux/mlp.jl")
diff --git a/benchmark/benchmark/nnlib.jl b/benchmark/benchmark/nnlib.jl
index 2fa63b5..f4f2548 100644
--- a/benchmark/benchmark/nnlib.jl
+++ b/benchmark/benchmark/nnlib.jl
@@ -1,258 +1,13 @@
 using NNlib
 using NNlib.ChainRulesCore: rrule
-using Random
 
 SUITE["nnlib"] = BenchmarkGroup()
 
-########## activations ############
-SUITE["nnlib"]["activations"] = BenchmarkGroup()
-for et in (Float64, Float32, Float16,)
-    et_suite = BenchmarkGroup()
-    SUITE["nnlib"]["activations"][string(et)] = et_suite
-    let x = rand(et, 1024, 1024), y = similar(x)
-        for f in NNlib.ACTIVATIONS
-            act = @eval($f)
-            et_suite[string(f)] = @benchmarkable broadcast!($act, $y, $x)
-        end
-    end
-end
-
-
-########## softmax ############
-SUITE["nnlib"]["softmax"] = BenchmarkGroup()
-for (fn!, fn_bw) in [(softmax!, NNlib.∇softmax_data), (logsoftmax!, NNlib.∇logsoftmax_data)]
-    fn_suite = BenchmarkGroup()
-    SUITE["nnlib"]["softmax"][rstrip(string(fn!), '!')] = fn_suite
-    let SIZES = [
-        (12288, 2048, 1), (4096, 4096, 2), (4096, 2048, 2), (2048, 2048, 2),
-        (1024, 2048, 4), (768, 1024, 4), (512, 784, 8), (128, 384, 8),
-    ]
-        for et in (Float32, Float16,)
-            et_suite = BenchmarkGroup("fw" => BenchmarkGroup(), "bw" => BenchmarkGroup())
-            fn_suite[string(et)] = et_suite
-            for sz in SIZES
-                x = randn(et, sz)
-                y = similar(x)
-                dy = zero(x)
-                fn!(y, x)
-                et_suite["fw"][string(sz)] = @benchmarkable $fn!($y, $x)
-                et_suite["bw"][string(sz)] = @benchmarkable $fn_bw($dy, $y)
-            end
-        end
-    end
-end
-
-
-########## conv ############
-SUITE["nnlib"]["conv"] = BenchmarkGroup()
-for rank in (3, 2, 1,), N in (512, 256,), K in (3,),
-    C_in in (1,), C_out in (1,),
-    stride in (1,), dilation in (1,), padding in (2, 0,)
-
-    size_suite = BenchmarkGroup()
-    SUITE["nnlib"]["conv"][
-        "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-        ] = size_suite
-
-    conv_items = [
-        (NNlib.conv_direct!, NNlib.∇conv_data_direct!, NNlib.∇conv_filter_direct!, DenseConvDims, "direct"),
-        (NNlib.conv_im2col!, NNlib.∇conv_data_im2col!, NNlib.∇conv_filter_im2col!, DenseConvDims, "im2col"),
-        (NNlib.depthwiseconv_direct!, NNlib.∇depthwiseconv_data_direct!, NNlib.∇depthwiseconv_filter_direct!, DepthwiseConvDims, "direct"),
-        (NNlib.depthwiseconv_im2col!, NNlib.∇depthwiseconv_data_im2col!, NNlib.∇depthwiseconv_filter_im2col!, DepthwiseConvDims, "im2col"),
-    ]
-
-    for (conv!, ∇conv_data!, ∇conv_filter!, cdimT, _) in conv_items
-        conv_suite = BenchmarkGroup()
-        SUITE["nnlib"]["conv"][
-            "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-            ][rstrip(string(conv!), '!')] = conv_suite
-
-        for et in (Float32, Float64)
-            et_suite = BenchmarkGroup()
-            SUITE["nnlib"]["conv"][
-                "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-                ][rstrip(string(conv!), '!')][string(et)] = et_suite
-
-            x = zeros(et, repeat([N], rank)..., C_in, 1)
-            w = (cdimT == DenseConvDims) ?
-                zeros(et, repeat([K], rank)..., C_in, C_out) :
-                zeros(et, repeat([K], rank)..., C_out, C_in)
-
-            cdims = try
-                cdimT(x, w; stride = stride, dilation = dilation, padding = padding)
-            catch
-                continue
-            end
-
-            y = (cdimT == DenseConvDims) ?
-                zeros(et, NNlib.output_size(cdims)..., C_out, 1) :
-                zeros(et, NNlib.output_size(cdims)..., C_out*C_in, 1)
-
-            dx, dy, dw = similar(x), similar(y), similar(w)
-            SUITE["nnlib"]["conv"][
-                "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-                ][rstrip(string(conv!), '!')][string(et)]["conv"] = @benchmarkable $(conv!)($y, $x, $w, $cdims)
-            SUITE["nnlib"]["conv"][
-                "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-                ][rstrip(string(conv!), '!')][string(et)]["data"] = @benchmarkable $(∇conv_data!)($dx, $y, $w, $cdims)
-            SUITE["nnlib"]["conv"][
-                "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
-                ][rstrip(string(conv!), '!')][string(et)]["filter"] = @benchmarkable $(∇conv_filter!)($dw, $x, $y, $cdims)
-        end
-    end
-end
-
-
-########## pooling ############
-SUITE["nnlib"]["pooling"] = BenchmarkGroup()
-for rank in (3, 2, 1,), N in (512, 256,), K in (4, 2,), stride in (4, 2, 1,)
-    size_suite = BenchmarkGroup()
-    SUITE["nnlib"]["pooling"]["$(rank+2)-N($N)-K($K)-stride($stride)"] = size_suite
-
-    x = zeros(Float32, repeat([N], rank)..., 1, 1)
-    pdims = PoolDims(x, K; stride = stride)
-    y = zeros(Float32, NNlib.output_size(pdims)..., 1, 1)
-    dx, dy = similar(x), similar(y)
-
-    pooling_items = [
-        (NNlib.maxpool!, NNlib.∇maxpool!, "maxpool"),
-        (NNlib.meanpool!, NNlib.∇meanpool!, "meanpool"),
-        (NNlib.lpnormpool!, NNlib.∇lpnormpool!, "lpnormpool"),
-    ]
-
-    for (pool, ∇pool, name) in pooling_items
-        pooling_suite = BenchmarkGroup()
-        SUITE["nnlib"]["pooling"][
-            "$(rank+2)-N($N)-K($K)-stride($stride)"
-            ]["$(name)$(rank)d-direct"] = pooling_suite
-        SUITE["nnlib"]["pooling"][
-            "$(rank+2)-N($N)-K($K)-stride($stride)"
-            ]["$(name)$(rank)d-direct"]["pool"] = @benchmarkable $pool(
-                $y, $x, $pdims; p = ($name == "lpnormpool") ? 2 : nothing)
-        SUITE["nnlib"]["pooling"][
-            "$(rank+2)-N($N)-K($K)-stride($stride)"
-            ]["$(name)$(rank)d-direct"]["data"] = @benchmarkable $(∇pool)(
-                $dx, $dy, $y, $x, $pdims; p = ($name == "lpnormpool") ? 2 : nothing)
-    end
-
-    if NNlib.is_nnpack_available() && NNlib.nnpack_supported_operation(pdims)
-        SUITE["nnlib"]["pooling"][
-            "$(rank+2)-N($N)-K($K)-stride($stride)"
-            ]["maxpool$(rank)d-nnpack"]["pool"] = @benchmarkable NNlib.maxpool_nnpack!($y, $x, $pdims)
-    end
-end
-
-
-########## dropout ############
-SUITE["nnlib"]["dropout"] = BenchmarkGroup()
-for rank in (1, 2, 3,), N in (128, 512, 1024,)
-    size_suite = BenchmarkGroup()
-    SUITE["nnlib"]["dropout"]["$(rank+2)-N($N)"] = size_suite
-
-    x = ones(Float32, repeat([N], rank)..., 1, 1)
-    y = zeros(Float32, repeat([N], rank)..., 1, 1)
-    p = 0.2
-
-    dropout_suite = BenchmarkGroup()
-    dropout_suite["with-colon"] = @benchmarkable dropout($x, $p)
-    dropout_suite["with-dim"] = @benchmarkable dropout($x, $p; dims = 1)
-    SUITE["nnlib"]["dropout"]["$(rank+2)-N($N)"]["dropout"] = dropout_suite
-
-    dropout!_suite = BenchmarkGroup()
-    dropout!_suite["with-colon"] = @benchmarkable dropout!($y, $x, $p)
-    dropout!_suite["with-dim"] = @benchmarkable dropout!($y, $x, $p; dims = 1)
-    SUITE["nnlib"]["dropout"]["$(rank+2)-N($N)"]["dropout!"] = dropout!_suite
-end
-
-
-########## upsample ############
-SUITE["nnlib"]["upsample"] = BenchmarkGroup()
-SUITE["nnlib"]["upsample"]["linear"] = BenchmarkGroup()
-for rank in (3, 2, 1,), et in (Float32, Float16,)
-    et_suite = BenchmarkGroup("fw" => BenchmarkGroup(), "bw" => BenchmarkGroup())
-    SUITE["nnlib"]["upsample"]["linear"][string(et)] = et_suite
-
-    inputs_sizes = [
-        (1024, (0.5, 2), false), (256, 8, false),
-        (256, 4, true), (128, (1, 2), false), (128, 2, true),
-    ]
-    for (sz, scale, ac) in inputs_sizes
-        x = ones(et, repeat([sz], rank)..., 1, 1)
-        et_suite["fw"][
-            "$(rank+2)-N($sz)-scale($scale)"
-            ] = @benchmarkable upsample_linear($x, $scale; align_corners = $ac)
-        et_suite["bw"][
-            "$(rank+2)-N($sz)-scale($scale)"
-            ] = @benchmarkable ∇upsample_linear($x;
-                size = (typeof($scale) <: Tuple) ?
-                    floor.(Integer, $sz .* $scale) :
-                    ntuple(_ -> floor(Integer, $sz * $scale), $rank),
-                align_corners = $ac)
-    end
-end
-
-SUITE["nnlib"]["upsample"]["nearest"] = BenchmarkGroup()
-for rank in (3, 2, 1,), N in (1024, 512, 128,)
-    et_suite = BenchmarkGroup()
-    for et in (Float64, Float32, Float16,)
-        x = zeros(Float32, repeat([N], rank)..., 1, 1)
-        et_suite[string(et)] = @benchmarkable upsample_nearest($x; size = (repeat([$N * 10], $rank)..., 1, 1))
-    end
-    SUITE["nnlib"]["upsample"]["nearest"]["$(rank+2)-N($N)"] = et_suite
-end
-
-
-########## gemm ############
-SUITE["nnlib"]["gemm"] = BenchmarkGroup()
-for et in (Float32, Float64)
-    et_suite = BenchmarkGroup(
-        "gemm!" => BenchmarkGroup(),
-        "batched_gemm!" => BenchmarkGroup())
-    SUITE["nnlib"]["gemm"][string(et)] = et_suite
-
-    # transA and transB are not of the main varaints.
-    # gemm! meets some memory problem, not included here.
-    input_items = [
-        (Val(false), Val(false), 'N', 'N', 1024, 1024, 1024, et(0.5), et(0.0)),
-        (Val(false), Val(false), 'N', 'N', 512, 512, 128, et(0.5), et(1.0)),
-        (Val(false), Val(false), 'N', 'N', 80, 40, 100, et(1.0), et(0.0)),
-    ]
-    for (transA, transB, transA_ch, transB_ch, M, N, K, alpha, beta) in input_items
-        bA = ones(et, M, N, 1)
-        bB = ones(et, N, K, 1)
-        bC = zeros(et, M, K, 1)
-        et_suite["batched_gemm!"][
-           "trans($transA_ch,$transB_ch)-M($M)-N($N)-K($K)-alpha($alpha)-beta($beta)"
-        ] = @benchmarkable NNlib.batched_gemm!(
-           $transA_ch, $transB_ch,
-           $alpha, $bA, $bB, $beta, $bC)
-    end
-end
-
-
-########## attention ############
-SUITE["nnlib"]["attention"] = BenchmarkGroup()
-for et in (Float16, Float64)
-    et_suite = BenchmarkGroup(
-        "attention" => BenchmarkGroup(), "score" => BenchmarkGroup())
-    SUITE["nnlib"]["attention"][string(et)] = et_suite
-
-    input_items = [
-        ((16,128,8), (16,512,8), (32,512,8), (512,128), 4),
-        ((64,64,16), (64,64,16), (64,64,16), (64,64), 4),
-        ((8,6,1), (8,10,1), (4,10,1), nothing, 1),
-    ]
-    for (q_sz, k_sz, v_sz, bias_sz, nheads) in input_items
-        q, q_score = rand(et, q_sz...), rand(et, 8, q_sz...)
-        k, k_score = rand(et, k_sz...), rand(et, 8, k_sz...)
-        v = rand(et, v_sz...)
-        bias = isnothing(bias_sz) ? nothing : rand(et, bias_sz...)
-        mask = isnothing(bias_sz) ? nothing : rand(Bool, bias_sz...)
-        et_suite["attention"][
-            "q($q_sz)-k($k_sz)-v($v_sz)-bias($bias_sz)-nheads($nheads)"
-        ] = @benchmarkable dot_product_attention($q, $k, $v, $bias; nheads = $nheads)
-        et_suite["score"][
-            "q(8, $q_sz)-k(8, $k_sz)-bias($bias_sz)-nheads($nheads)"
-        ] = @benchmarkable dot_product_attention_scores($q_score, $k_score, $bias; mask = $mask)
-    end
-end
+register_benchmark("FLUXML_BENCHMARK_NNLIB_ACTIVATIONS", "nnlib/activations.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_SOFTMAX", "nnlib/softmax.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_CONV", "nnlib/conv.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_POOLING", "nnlib/pooling.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_DROPOUT", "nnlib/dropout.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_UPSAMPLE", "nnlib/upsample.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_GEMM", "nnlib/gemm.jl")
+register_benchmark("FLUXML_BENCHMARK_NNLIB_ATTENTION", "nnlib/attention.jl")
diff --git a/benchmark/benchmark/nnlib/activations.jl b/benchmark/benchmark/nnlib/activations.jl
new file mode 100644
index 0000000..5133e86
--- /dev/null
+++ b/benchmark/benchmark/nnlib/activations.jl
@@ -0,0 +1,16 @@
+########## activations ############
+# Benchmark every activation listed in `NNlib.ACTIVATIONS`, broadcast in place
+# over a 1024x1024 random matrix, once per floating-point element type.
+SUITE["nnlib"]["activations"] = BenchmarkGroup()
+for et in (Float64, Float32, Float16,)
+    # One group per element type, keyed by the type name.
+    group = BenchmarkGroup()
+    SUITE["nnlib"]["activations"][string(et)] = group
+    input = rand(et, 1024, 1024)
+    output = similar(input)
+    for name in NNlib.ACTIVATIONS
+        # Look the activation up by name inside NNlib.
+        act = getfield(NNlib, name)
+        group[string(name)] = @benchmarkable broadcast!($act, $output, $input)
+    end
+end
diff --git a/benchmark/benchmark/nnlib/attention.jl b/benchmark/benchmark/nnlib/attention.jl
new file mode 100644
index 0000000..50a00e0
--- /dev/null
+++ b/benchmark/benchmark/nnlib/attention.jl
@@ -0,0 +1,37 @@
+########## attention ############
+# Benchmarks for `dot_product_attention` and `dot_product_attention_scores`
+# on random inputs, grouped by element type.
+SUITE["nnlib"]["attention"] = BenchmarkGroup()
+for et in (Float16, Float64)
+    attn_group = BenchmarkGroup()
+    score_group = BenchmarkGroup()
+    SUITE["nnlib"]["attention"][string(et)] = BenchmarkGroup(
+        "attention" => attn_group, "score" => score_group)
+
+    # Each case: (size of q, size of k, size of v, size of bias or `nothing`, nheads).
+    cases = [
+        ((16,128,8), (16,512,8), (32,512,8), (512,128), 4),
+        ((64,64,16), (64,64,16), (64,64,16), (64,64), 4),
+        ((8,6,1), (8,10,1), (4,10,1), nothing, 1),
+    ]
+    for (q_dims, k_dims, v_dims, bias_dims, heads) in cases
+        has_bias = !isnothing(bias_dims)
+        q = rand(et, q_dims...)
+        # Inputs of the score benchmark carry an extra leading dimension of 8.
+        q_score = rand(et, 8, q_dims...)
+        k = rand(et, k_dims...)
+        k_score = rand(et, 8, k_dims...)
+        v = rand(et, v_dims...)
+        # Bias and boolean mask share one shape and are omitted together.
+        bias = has_bias ? rand(et, bias_dims...) : nothing
+        mask = has_bias ? rand(Bool, bias_dims...) : nothing
+
+        attn_key = "q($q_dims)-k($k_dims)-v($v_dims)-bias($bias_dims)-nheads($heads)"
+        attn_group[attn_key] =
+            @benchmarkable dot_product_attention($q, $k, $v, $bias; nheads = $heads)
+
+        score_key = "q(8, $q_dims)-k(8, $k_dims)-bias($bias_dims)-nheads($heads)"
+        score_group[score_key] =
+            @benchmarkable dot_product_attention_scores($q_score, $k_score, $bias; mask = $mask)
+    end
+end
diff --git a/benchmark/benchmark/nnlib/conv.jl b/benchmark/benchmark/nnlib/conv.jl
new file mode 100644
index 0000000..9b571f4
--- /dev/null
+++ b/benchmark/benchmark/nnlib/conv.jl
@@ -0,0 +1,57 @@
+########## conv ############
+# Forward, data-gradient and filter-gradient benchmarks for the direct and
+# im2col implementations of dense and depthwise convolution.
+SUITE["nnlib"]["conv"] = BenchmarkGroup()
+for rank in (3, 2, 1,), N in (512, 256,), K in (3,),
+    C_in in (1,), C_out in (1,),
+    stride in (1,), dilation in (1,), padding in (2, 0,)
+
+    # One group per parameter combination; the key is built once and reused.
+    size_key = "$(rank+2)-N($N)-K($K)-in($C_in)-out($C_out)-stride($stride)-dilation($dilation)-padding($padding)"
+    size_suite = BenchmarkGroup()
+    SUITE["nnlib"]["conv"][size_key] = size_suite
+
+    # (forward, data gradient, filter gradient, dims type, implementation label)
+    conv_items = [
+        (NNlib.conv_direct!, NNlib.∇conv_data_direct!, NNlib.∇conv_filter_direct!, DenseConvDims, "direct"),
+        (NNlib.conv_im2col!, NNlib.∇conv_data_im2col!, NNlib.∇conv_filter_im2col!, DenseConvDims, "im2col"),
+        (NNlib.depthwiseconv_direct!, NNlib.∇depthwiseconv_data_direct!, NNlib.∇depthwiseconv_filter_direct!, DepthwiseConvDims, "direct"),
+        (NNlib.depthwiseconv_im2col!, NNlib.∇depthwiseconv_data_im2col!, NNlib.∇depthwiseconv_filter_im2col!, DepthwiseConvDims, "im2col"),
+    ]
+
+    for (conv!, ∇conv_data!, ∇conv_filter!, cdimT, _) in conv_items
+        conv_suite = BenchmarkGroup()
+        size_suite[rstrip(string(conv!), '!')] = conv_suite
+
+        for et in (Float32, Float64)
+            et_suite = BenchmarkGroup()
+            conv_suite[string(et)] = et_suite
+
+            is_dense = cdimT == DenseConvDims
+            x = zeros(et, repeat([N], rank)..., C_in, 1)
+            # Dense and depthwise kernels order their last two dimensions differently.
+            w = is_dense ?
+                zeros(et, repeat([K], rank)..., C_in, C_out) :
+                zeros(et, repeat([K], rank)..., C_out, C_in)
+
+            # Skip combinations whose dims object cannot be constructed; the
+            # (empty) element-type group registered above is left in place.
+            cdims = try
+                cdimT(x, w; stride = stride, dilation = dilation, padding = padding)
+            catch
+                continue
+            end
+
+            y = is_dense ?
+                zeros(et, NNlib.output_size(cdims)..., C_out, 1) :
+                zeros(et, NNlib.output_size(cdims)..., C_out*C_in, 1)
+
+            # Gradient outputs. No separate `dy` buffer is allocated: `y` is
+            # what the gradient benchmarks below receive as upstream gradient.
+            dx, dw = similar(x), similar(w)
+            et_suite["conv"] = @benchmarkable $(conv!)($y, $x, $w, $cdims)
+            et_suite["data"] = @benchmarkable $(∇conv_data!)($dx, $y, $w, $cdims)
+            et_suite["filter"] = @benchmarkable $(∇conv_filter!)($dw, $x, $y, $cdims)
+        end
+    end
+end
diff --git a/benchmark/benchmark/nnlib/dropout.jl b/benchmark/benchmark/nnlib/dropout.jl
new file mode 100644
index 0000000..90dd838
--- /dev/null
+++ b/benchmark/benchmark/nnlib/dropout.jl
@@ -0,0 +1,23 @@
+########## dropout ############
+# Benchmarks for `dropout` and `dropout!` on Float32 arrays that have `rank`
+# dimensions of length `N` followed by two singleton dimensions.
+SUITE["nnlib"]["dropout"] = BenchmarkGroup()
+for rank in (1, 2, 3,), N in (128, 512, 1024,)
+    shape = (repeat([N], rank)..., 1, 1)
+    src = ones(Float32, shape...)
+    dst = zeros(Float32, shape...)
+    prob = 0.2
+
+    # Out-of-place variant, with and without an explicit `dims`.
+    alloc_suite = BenchmarkGroup()
+    alloc_suite["with-colon"] = @benchmarkable dropout($src, $prob)
+    alloc_suite["with-dim"] = @benchmarkable dropout($src, $prob; dims = 1)
+
+    # In-place variant writing into `dst`.
+    inplace_suite = BenchmarkGroup()
+    inplace_suite["with-colon"] = @benchmarkable dropout!($dst, $src, $prob)
+    inplace_suite["with-dim"] = @benchmarkable dropout!($dst, $src, $prob; dims = 1)
+
+    SUITE["nnlib"]["dropout"]["$(rank+2)-N($N)"] = BenchmarkGroup(
+        "dropout" => alloc_suite, "dropout!" => inplace_suite)
+end
diff --git a/benchmark/benchmark/nnlib/gemm.jl b/benchmark/benchmark/nnlib/gemm.jl
new file mode 100644
index 0000000..9741902
--- /dev/null
+++ b/benchmark/benchmark/nnlib/gemm.jl
@@ -0,0 +1,29 @@
+########## gemm ############
+# Benchmarks for `NNlib.batched_gemm!` on inputs with a batch dimension of 1.
+SUITE["nnlib"]["gemm"] = BenchmarkGroup()
+for et in (Float32, Float64)
+    batched_suite = BenchmarkGroup()
+    # The "gemm!" group is registered but stays empty: gemm! ran into memory
+    # problems, so it is not benchmarked here.
+    SUITE["nnlib"]["gemm"][string(et)] = BenchmarkGroup(
+        "gemm!" => BenchmarkGroup(),
+        "batched_gemm!" => batched_suite)
+
+    # Only the non-transposed ('N', 'N') case is covered; transposed inputs
+    # are not among the main variants.
+    # Each case: (transA, transB, M, N, K, alpha, beta).
+    cases = [
+        ('N', 'N', 1024, 1024, 1024, et(0.5), et(0.0)),
+        ('N', 'N', 512, 512, 128, et(0.5), et(1.0)),
+        ('N', 'N', 80, 40, 100, et(1.0), et(0.0)),
+    ]
+    for (trans_a, trans_b, M, N, K, alpha, beta) in cases
+        # Operands are M x N x 1 and N x K x 1; the output is M x K x 1.
+        lhs = ones(et, M, N, 1)
+        rhs = ones(et, N, K, 1)
+        out = zeros(et, M, K, 1)
+        key = "trans($trans_a,$trans_b)-M($M)-N($N)-K($K)-alpha($alpha)-beta($beta)"
+        batched_suite[key] = @benchmarkable NNlib.batched_gemm!(
+            $trans_a, $trans_b, $alpha, $lhs, $rhs, $beta, $out)
+    end
+end
diff --git a/benchmark/benchmark/nnlib/pooling.jl b/benchmark/benchmark/nnlib/pooling.jl
new file mode 100644
index 0000000..6f4bfd7
--- /dev/null
+++ b/benchmark/benchmark/nnlib/pooling.jl
@@ -0,0 +1,35 @@
+########## pooling ############
+# Forward and backward benchmarks for max, mean and lp-norm pooling on Float32
+# inputs, plus an NNPACK max-pool entry when NNlib reports it as usable.
+SUITE["nnlib"]["pooling"] = BenchmarkGroup()
+for rank in (3, 2, 1,), N in (512, 256,), K in (4, 2,), stride in (4, 2, 1,)
+    size_suite = BenchmarkGroup()
+    SUITE["nnlib"]["pooling"]["$(rank+2)-N($N)-K($K)-stride($stride)"] = size_suite
+
+    x = zeros(Float32, repeat([N], rank)..., 1, 1)
+    pdims = PoolDims(x, K; stride = stride)
+    y = zeros(Float32, NNlib.output_size(pdims)..., 1, 1)
+    dx = similar(x)
+    dy = similar(y)
+
+    # (forward kernel, backward kernel, label used in the group key)
+    pooling_items = [
+        (NNlib.maxpool!, NNlib.∇maxpool!, "maxpool"),
+        (NNlib.meanpool!, NNlib.∇meanpool!, "meanpool"),
+        (NNlib.lpnormpool!, NNlib.∇lpnormpool!, "lpnormpool"),
+    ]
+    for (pool, ∇pool, label) in pooling_items
+        pooling_suite = BenchmarkGroup()
+        size_suite["$(label)$(rank)d-direct"] = pooling_suite
+        # `p` is 2 for lp-norm pooling and `nothing` for the other kernels.
+        pooling_suite["pool"] = @benchmarkable $pool(
+            $y, $x, $pdims; p = ($label == "lpnormpool") ? 2 : nothing)
+        pooling_suite["data"] = @benchmarkable $(∇pool)(
+            $dx, $dy, $y, $x, $pdims; p = ($label == "lpnormpool") ? 2 : nothing)
+    end
+
+    if NNlib.is_nnpack_available() && NNlib.nnpack_supported_operation(pdims)
+        size_suite["maxpool$(rank)d-nnpack"]["pool"] =
+            @benchmarkable NNlib.maxpool_nnpack!($y, $x, $pdims)
+    end
+end
diff --git a/benchmark/benchmark/nnlib/softmax.jl b/benchmark/benchmark/nnlib/softmax.jl
new file mode 100644
index 0000000..a8b5abe
--- /dev/null
+++ b/benchmark/benchmark/nnlib/softmax.jl
@@ -0,0 +1,28 @@
+########## softmax ############
+# Forward ("fw") and backward ("bw") benchmarks for softmax and logsoftmax,
+# grouped by function name, element type and input size.
+SUITE["nnlib"]["softmax"] = BenchmarkGroup()
+for (fn!, fn_bw) in [(softmax!, NNlib.∇softmax_data), (logsoftmax!, NNlib.∇logsoftmax_data)]
+    fn_suite = BenchmarkGroup()
+    # The group key is the function name with trailing '!' characters removed.
+    SUITE["nnlib"]["softmax"][rstrip(string(fn!), '!')] = fn_suite
+
+    input_sizes = [
+        (12288, 2048, 1), (4096, 4096, 2), (4096, 2048, 2), (2048, 2048, 2),
+        (1024, 2048, 4), (768, 1024, 4), (512, 784, 8), (128, 384, 8),
+    ]
+    for et in (Float32, Float16,)
+        fw_suite = BenchmarkGroup()
+        bw_suite = BenchmarkGroup()
+        fn_suite[string(et)] = BenchmarkGroup("fw" => fw_suite, "bw" => bw_suite)
+        for sz in input_sizes
+            input = randn(et, sz)
+            output = similar(input)
+            grad = zero(input)
+            # Run the forward pass once so `output` is filled before the backward benchmark uses it.
+            fn!(output, input)
+            fw_suite[string(sz)] = @benchmarkable $fn!($output, $input)
+            bw_suite[string(sz)] = @benchmarkable $fn_bw($grad, $output)
+        end
+    end
+end
diff --git a/benchmark/benchmark/nnlib/upsample.jl b/benchmark/benchmark/nnlib/upsample.jl
new file mode 100644
index 0000000..2f54c89
--- /dev/null
+++ b/benchmark/benchmark/nnlib/upsample.jl
@@ -0,0 +1,46 @@
+########## upsample ############
+SUITE["nnlib"]["upsample"] = BenchmarkGroup()
+
+# Linear upsampling: forward ("fw") and backward ("bw") benchmarks.
+# NOTE(review): the group is keyed by element type only, so every `rank`
+# iteration replaces the group stored by the previous one and only the
+# last rank (1) remains registered -- confirm whether that is intended.
+SUITE["nnlib"]["upsample"]["linear"] = BenchmarkGroup()
+for rank in (3, 2, 1,), et in (Float32, Float16,)
+    et_suite = BenchmarkGroup("fw" => BenchmarkGroup(), "bw" => BenchmarkGroup())
+    SUITE["nnlib"]["upsample"]["linear"][string(et)] = et_suite
+
+    # Each case: (length of every non-singleton dimension, scale, align_corners).
+    inputs_sizes = [
+        (1024, (0.5, 2), false), (256, 8, false),
+        (256, 4, true), (128, (1, 2), false), (128, 2, true),
+    ]
+    for (sz, scale, ac) in inputs_sizes
+        x = ones(et, repeat([sz], rank)..., 1, 1)
+        et_suite["fw"][
+            "$(rank+2)-N($sz)-scale($scale)"
+            ] = @benchmarkable upsample_linear($x, $scale; align_corners = $ac)
+        # The backward benchmark passes `x` together with a `size` computed as
+        # `sz` times `scale`, rounded down.
+        et_suite["bw"][
+            "$(rank+2)-N($sz)-scale($scale)"
+            ] = @benchmarkable ∇upsample_linear($x;
+                size = (typeof($scale) <: Tuple) ?
+                    floor.(Integer, $sz .* $scale) :
+                    ntuple(_ -> floor(Integer, $sz * $scale), $rank),
+                align_corners = $ac)
+    end
+end
+
+# Nearest-neighbour upsampling to ten times the input length, per element type.
+SUITE["nnlib"]["upsample"]["nearest"] = BenchmarkGroup()
+for rank in (3, 2, 1,), N in (1024, 512, 128,)
+    et_suite = BenchmarkGroup()
+    for et in (Float64, Float32, Float16,)
+        # Allocate the input with the element type this entry is keyed by, so
+        # the three entries differ in more than their name.
+        x = zeros(et, repeat([N], rank)..., 1, 1)
+        et_suite[string(et)] = @benchmarkable upsample_nearest($x; size = (repeat([$N * 10], $rank)..., 1, 1))
+    end
+    SUITE["nnlib"]["upsample"]["nearest"]["$(rank+2)-N($N)"] = et_suite
+end