Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: fixes for testing
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 5, 2024
1 parent cd34365 commit 1afc1c7
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 9 deletions.
4 changes: 2 additions & 2 deletions src/api/instancenorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ end

function instancenorm(x::AbstractArray, γ::Optional{<:AbstractVector},
β::Optional{<:AbstractVector}, rμ::Optional{<:AbstractVector},
rσ²::Optional{<:AbstractVector}, training::TrainingType,
σ::F=identity, momentum::Real=0.1f0, epsilon::Real=default_epsilon(x)) where {F}
rσ²::Optional{<:AbstractVector}, training::TrainingType, σ::F=identity,
momentum::Optional{<:Real}=0.1f0, epsilon::Real=default_epsilon(x)) where {F}
assert_valid_instancenorm_arguments(x)

y, rμₙ, rσ²ₙ = instancenorm_impl(
Expand Down
8 changes: 4 additions & 4 deletions src/impl/normalization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,10 @@ CRC.@non_differentiable get_norm_reshape_dims(::Any...)

# Entry Points
## InstanceNorm
function instancenorm(x::AbstractArray{xT, N}, ::Optional{<:AbstractVector},
rσ²::Optional{<:AbstractVector}, γ::Optional{<:AbstractVector},
β::Optional{<:AbstractVector}, training::StaticBool,
momentum, epsilon, act::F) where {xT, N, F}
function instancenorm(x::AbstractArray{xT, N}, γ::Optional{<:AbstractVector},
β::Optional{<:AbstractVector}, ::Optional{<:AbstractVector},
rσ²::Optional{<:AbstractVector}, training::StaticBool,
act::F, momentum, epsilon) where {xT, N, F}
y, rμₙ, rσ²ₙ = normalization(
x, rμ, rσ², γ, β, instancenorm_reduce_dims(x), training, momentum, epsilon, act)
return y, safe_vec(rμₙ), safe_vec(rσ²ₙ)
Expand Down
7 changes: 4 additions & 3 deletions test/normalization/instancenorm_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ function run_instancenorm_testing(gen_f, T, sz, training, act, aType, mode, ongp
@jet instancenorm(x, scale, bias, rm, rv, training, act, T(0.1), epsilon)

if anonact !== act && is_training(training)
lfn = (x, sc, b, rm, rv, act, ϵ) -> sum(first(instancenorm(
x, sc, b, rm, rv, Val(true), act, T(0.1), ϵ)))
@test @inferred(Zygote.gradient(lfn, x, scale, bias, rm, rv, act, epsilon)) isa Any
lfn = (x, sc, b, rm, rv, act, m, ϵ) -> sum(first(instancenorm(
x, sc, b, rm, rv, Val(true), act, m, ϵ)))
@test @inferred(Zygote.gradient(
lfn, x, scale, bias, rm, rv, act, T(0.1), epsilon)) isa Any
end

@test y isa aType{T, length(sz)}
Expand Down

1 comment on commit 1afc1c7

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 1afc1c7 Previous: 9d522c5 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5479.5 ns 5750 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6375 ns 6187.5 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8000 ns 7979 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6375 ns 6958.5 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119198 ns 119461 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2649209 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 704000 ns 723417 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 417764 ns 417664 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9812 ns 9834 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9625 ns 9792 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10042 ns 9916 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9541 ns 10166 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 551456 ns 551816 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 16841216 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2645125 ns 2364708 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 659636 ns 695047 ns 0.95
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1395.5 ns 1458 ns 0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1687.5 ns 1687.5 ns 1
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1875 ns 1917 ns 0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 2521 ns 1250 ns 2.02
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21867 ns 21782 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1304894 ns
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 212604 ns 189208 ns 1.12
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 30820.5 ns 30960 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4209 ns 3958.5 ns 1.06
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4312.5 ns 4167 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 3917 ns 4000 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4375 ns 4334 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 146279 ns 148046.5 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 8894773.5 ns
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1523375 ns 1745084 ns 0.87
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 148982 ns 148342 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57542 ns 56083 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46584 ns 39917 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39875 ns 47000 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83708 ns 82750 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36787 ns 37366 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 582007 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 985625 ns 1348187.5 ns 0.73
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 84391 ns 80291 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036583 ns 2017708 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2086750 ns 2083959 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2079917 ns 2090792 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1987312.5 ns 1999604 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 227214 ns 232635 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7854957 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7818750 ns 7104833 ns 1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 967560 ns 1540007 ns 0.63
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 154083 ns 143708 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146958 ns 173750.5 ns 0.85
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149979.5 ns 165562.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 165187.5 ns 165979 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166381 ns 166570 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7795058 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1464583 ns 1701792 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 207072 ns 205502.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1110895.5 ns 1100292 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1103209 ns 1114709 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1118687 ns 1122042 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1109562.5 ns 1119916 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 711437 ns 713685 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33922938.5 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6051917 ns 7357125 ns 0.82
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1036360 ns 1039502 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5208 ns 4458 ns 1.17
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4271 ns 4291 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5375 ns 6208 ns 0.87
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4584 ns 4416 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 94268 ns 94296 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5136056 ns
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 711583 ns 782083.5 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 69481 ns 69431 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8667 ns 8542 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8834 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8917 ns 9083 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8333 ns 8583 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 603970 ns 608245 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 33683319.5 ns
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5821292 ns 5666604.5 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389889 ns 384864 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17729.5 ns 17229 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20042 ns 17250 ns 1.16
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20584 ns 22250 ns 0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20416.5 ns 18312.5 ns 1.11
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 66995 ns 68096 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 2897295 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1301292 ns 1292667 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 73931 ns 74070.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 211625 ns 218583 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218875 ns 244459 ns 0.90
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218667 ns 213333 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 224875 ns 220875 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 357740 ns 359693 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 14308445 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5704396 ns 7278917 ns 0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 473855 ns 475315 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 708 ns 0.88
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 666 ns 584 ns 1.14
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 916.5 ns 0.82
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 666 ns 583 ns 1.14
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20965 ns 20807.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1157358.5 ns
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 283542 ns 297208 ns 0.95
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32571 ns 33001 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1375 ns 1375 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1375 ns 1458 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1500 ns 1583 ns 0.95
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1334 ns 1417 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 125947 ns 126203 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8433349.5 ns
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1594979.5 ns 1457625 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 138471 ns 138172 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5375 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 6083 ns 0.88
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10417 ns 10291 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23836 ns 24430 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1232101.5 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 583125 ns 351229 ns 1.66
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46460 ns 47101 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227708 ns 219208 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 235583 ns 261791 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 264667 ns 228625 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 248583 ns 223750 ns 1.11
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 190580 ns 194664 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 29562269.5 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8564854.5 ns 11964250 ns 0.72
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 611281 ns 617187 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4084 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4167 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4125 ns 4084 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23789 ns 23689 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2018577 ns
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 219791.5 ns 203375 ns 1.08
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 50370 ns 48541 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16958 ns 16958 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17083 ns 16583 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17083 ns 17250 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16666 ns 16917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 197449 ns 196884 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 9693737.5 ns
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 940458 ns 1560667 ns 0.60
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 176226.5 ns 174782 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 509500 ns 509333 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 405083 ns 332250 ns 1.22
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 332459 ns 404250 ns 0.82
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 865125 ns 865708 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113130 ns 114284.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 391060 ns
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 451416 ns 392875 ns 1.15
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 248703 ns 248273 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2324333 ns 2318021 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2025375.5 ns 1745083 ns 1.16
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1752833.5 ns 2021000 ns 0.87
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3200583 ns 3274791.5 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 244865 ns 244508 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11656548 ns
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 1966229 ns 2001875 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 761317.5 ns 763478 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6250 ns 5833 ns 1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6145.5 ns 7167 ns 0.86
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7729 ns 7271 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6375 ns 6124.5 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 93009 ns 92855.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5406797 ns
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 758167 ns 861271 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 60110 ns 60401 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10646 ns 11375 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10542 ns 11750 ns 0.90
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11084 ns 12229 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10375 ns 11125 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 660576 ns 638820 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 38819677 ns
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5487104 ns 6435375 ns 0.85
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 416424 ns 416514.5 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 541 ns 541 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23635 ns 23671 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2221310 ns
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 319750 ns 318791 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 53401 ns 53351 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2167 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2084 ns 2166 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 232566 ns 222818.5 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11381984 ns
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1912541.5 ns 1967167 ns 0.97
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 186466.5 ns 180782 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8375 ns 8708 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8750 ns 8833 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10438 ns 9895.5 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8958 ns 8709 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 104173 ns 100619 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3244842 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 896708 ns 898521 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 74231 ns 74410.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17708 ns 17375 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17750 ns 17167 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18187.5 ns 19375 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18041.5 ns 18250 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 610296 ns 574738 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 17126722 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5229458 ns 5654917 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 387209 ns 389229 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 541 ns 667 ns 0.81
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 35555 ns 36237 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1100087 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 438541 ns 463667 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 47930 ns 48401 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9312 ns 8437.5 ns 1.10
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8125 ns 9312 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9792 ns 9875 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9146 ns 9708 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 256000 ns 254845 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19311232 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4774937.5 ns 5087792 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 378844 ns 375784 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397000 ns 395833.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288125 ns 215750 ns 1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215667 ns 288166 ns 0.75
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756875 ns 756000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111981 ns 112957 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 320003 ns
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 365500 ns 299833 ns 1.22
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 78230 ns 76681 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1460875 ns 1455646 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1135291.5 ns 862000 ns 1.32
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 862687.5 ns 1130021 ns 0.76
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2357291 ns 2442563 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 209166.5 ns 210541 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 9267436 ns
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1516312.5 ns 1636104.5 ns 0.93
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 323643 ns 325573.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6667 ns 7000 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6959 ns 7084 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8958.5 ns 8125 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7334 ns 7041 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 144567 ns 136948 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5867002 ns
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 707270.5 ns 760125 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 70660 ns 68820 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15395.5 ns 14625 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 12417 ns 15042 ns 0.83
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14250 ns 14958.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13312 ns 15625 ns 0.85
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 958993.5 ns 931253.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 40369162 ns
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5752729.5 ns 6306249.5 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 433804 ns 436305 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24416 ns 25542 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 26417 ns 27334 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28687 ns 28354 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 26874.5 ns 31542 ns 0.85
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 201880.5 ns 200462.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8100056 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 896833 ns 1129500 ns 0.79
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114876.5 ns 112942 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 148834 ns 149250 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 104708 ns 131583.5 ns 0.80
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 153500 ns 106479 ns 1.44
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 116979 ns 153208 ns 0.76
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1086710 ns 1062590 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41151661 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5843229.5 ns 5978292 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 594985 ns 590197 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 73958 ns 76250 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76791.5 ns 74291.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 80166 ns 77333 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75417 ns 76792 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 207189 ns 209030.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7362606 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 519687.5 ns 638458 ns 0.81
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 126391.5 ns 130572 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 297334 ns 216500 ns 1.37
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221667 ns 297395.5 ns 0.75
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 288917 ns 212146 ns 1.36
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221041.5 ns 306208 ns 0.72
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1119401 ns 1140320 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41008184.5 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6497687.5 ns 7480542 ns 0.87
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 694627 ns 697363 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 16417 ns 15833 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 16583 ns 17291.5 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17792 ns 17875 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16708 ns 16687.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 147421 ns 150183 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5759467 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 427292 ns 779979 ns 0.55
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 237703 ns 237943 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 24833.5 ns 26458.5 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27042 ns 25708 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27166.5 ns 27625 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27125 ns 27750 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 984196 ns 987976 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 40719457 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5828333 ns 7131041.5 ns 0.82
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 714022 ns 701547 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11562.5 ns 10396 ns 1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10375 ns 11563 ns 0.90
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12083 ns 12833 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11083 ns 10875.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 124895.5 ns 125970.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3575871 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 912833 ns 910812.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 242943 ns 241512 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21125 ns 21083 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21917 ns 21604.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22000 ns 23041.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21416 ns 21541.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 706086.5 ns 709336 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 21428227.5 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5387146 ns 5733333 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 673547 ns 676248 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 64000.5 ns 62667 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 63500 ns 63771 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66166 ns 65667 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 62584 ns 67667 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 105629.5 ns 107292 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3434086.5 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1323250 ns 1352583.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 237572 ns 240373 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 448750 ns 444083 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 437958 ns 448875 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 446666 ns 440458 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 449583 ns 445833.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 517219 ns 521267 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 21208755 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5978042 ns 8808750 ns 0.68
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 730458 ns 728812.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6958.5 ns 6958.5 ns 1
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6833 ns 7291 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8041 ns 8771 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7771 ns 7104 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 145909.5 ns 147758.5 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5602766 ns
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 628395.5 ns 763583 ns 0.82
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 58991 ns 60941 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14042 ns 15125 ns 0.93
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15750 ns 14417 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13917 ns 15334 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13479 ns 15958 ns 0.84
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 954313 ns 958359.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 38432249.5 ns
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5549500 ns 6378396 ns 0.87
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 404584 ns 409474 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6160416 ns 6155291 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6378167 ns 3225687.5 ns 1.98
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 3224791.5 ns 6379541 ns 0.51
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11924000 ns 11906125 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301800.5 ns 351844 ns 0.86
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 294983 ns 301554 ns 0.98
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19104958 ns 19041833.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19957229 ns 11118520.5 ns 1.79
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 11123708.5 ns 19989395.5 ns 0.56
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36532604 ns 36469125 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1023618 ns 1015731 ns 1.01
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1158122 ns 1151512 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 917 ns 959 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 958 ns 959 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 959 ns 958 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23554 ns 23791 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2143802 ns
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 316188 ns 317417 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 215672 ns 215032 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3625 ns 3667 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3667 ns 3667 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3666 ns 3750 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3666 ns 3708 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 283503 ns 283833 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 11257238 ns
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2086333.5 ns 2116208 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 637297 ns 634877 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8000 ns 7167 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7958 ns 7833.5 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9042 ns 9291 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7854 ns 7500 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 120818.5 ns 122503 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3517154 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 776959 ns 866646 ns 0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67641 ns 66931 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11729.5 ns 11709 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12250 ns 11834 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12334 ns 13291 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12458.5 ns 11875 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 643501 ns 651319 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 21447178 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5189125.5 ns 5038083 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 365334 ns 365314 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22596 ns 22923 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 1951713 ns
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 225750 ns 208979.5 ns 1.08
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 52251 ns 50651 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3041 ns 3000 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3208 ns 2959 ns 1.08
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3375 ns 3250 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3042 ns 2959 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 204741 ns 206218 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9227567 ns
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1619250 ns 1699541.5 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 172842 ns 158851.5 ns 1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11250 ns 10375 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11334 ns 11854.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13125 ns 12417 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11458 ns 12333 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 121547.5 ns 123182.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3353104 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 869041 ns 877125 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 243193 ns 241463 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 22000 ns 22062 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20583 ns 21625 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21167 ns 21708 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20791 ns 20084 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 598450 ns 605852.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19931223.5 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4695229 ns 5025000 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 652706.5 ns 667502 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4417 ns 4584 ns 0.96
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4416 ns 4417 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4416 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24359 ns 24334 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2166080 ns
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 223833 ns 208417 ns 1.07
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 52541 ns 54130 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16667 ns 16375 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16500 ns 16375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16375 ns 16667 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16333 ns 16875 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 331128 ns 333246 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12599810 ns
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1647875.5 ns 1768771 ns 0.93
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 212037.5 ns 214042.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1959 ns 2084 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2083 ns 2000 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 1958 ns 2166 ns 0.90
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 1958 ns 2041 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35684 ns 36196 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1146851 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 441458.5 ns 473000 ns 0.93
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 206802 ns 205752 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16645.5 ns 17667 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 16750 ns 18937.5 ns 0.88
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 16562.5 ns 17625 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17208.5 ns 16896 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 294264.5 ns 297235 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20813859 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5292083 ns 5572167 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 703797.5 ns 694748 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59583.5 ns 55979.5 ns 1.06
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 63625 ns 60709 ns 1.05
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 62625 ns 65812.5 ns 0.95
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51292 ns 51583 ns 0.99
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66405 ns 66558 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 103511 ns 120591.5 ns 0.86
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 199395.5 ns 185895.5 ns 1.07
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 157250 ns 146354 ns 1.07
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 133937.5 ns 136208 ns 0.98
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 317729 ns 297104 ns 1.07
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 216342 ns 218976.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 579316 ns 584106 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82458.5 ns 112833.5 ns 0.73
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 85271 ns 86417 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 90209 ns 89416 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 140417 ns 81000 ns 1.73
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192334 ns 191966 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5533381 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1893708 ns 1945000 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 170101.5 ns 209467.5 ns 0.81
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1851687.5 ns 1912250 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1882334 ns 1923916 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1926500 ns 1917917 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1891958.5 ns 1922250 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 532324 ns 536309 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 25979046 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9683125 ns 11093750 ns 0.87
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1080090 ns 935284.5 ns 1.15
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21761 ns 21820 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2115738 ns
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 346875 ns 327833.5 ns 1.06
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 45220 ns 46181 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1750 ns 1791 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 253104 ns 254627 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 9490240.5 ns
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1088979 ns 1640833 ns 0.66
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 187502 ns 187212 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8084 ns 8209 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8438 ns 9083 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10875 ns 9896 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8209 ns 8417 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119061 ns 120586.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3459549.5 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 880209 ns 873250 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 237872 ns 236722 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10167 ns 10292 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9208 ns 8958 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9500 ns 9917 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9167 ns 8666 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 527070 ns 532717.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18222497.5 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4417458 ns 4452292 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 634411 ns 646767 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58417 ns 56750 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46333 ns 39708 ns 1.17
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39500 ns 47166 ns 0.84
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84083 ns 83125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39770 ns 40431 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1341281.5 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1100583.5 ns 1093666 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 75935.5 ns 77971 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1901542 ns 1903833 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1921833.5 ns 1979312 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1955833 ns 1983896 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1881792 ns 1849208 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 221320 ns 224788 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33766076 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11588792 ns 14363791.5 ns 0.81
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1036440 ns 1042991 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 415958 ns 415042 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 420042 ns 418584 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 419875 ns 420291 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 418708 ns 420459 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 210156.5 ns 212100.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7606443 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 522750 ns 1065709 ns 0.49
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 287858 ns 286133 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 764709 ns 742875 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 781812 ns 758958 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 753417 ns 691062.5 ns 1.09
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 678791.5 ns 742624.5 ns 0.91
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1059447 ns 1063422.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43854665.5 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6323063 ns 7312146 ns 0.86
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 916300 ns 924920 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3425978.5 ns 3442959 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3451792 ns 3441833 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3458979.5 ns 3417500 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3412708 ns 3453000 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 170950 ns 174858 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8189493 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1396875 ns 1420583 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 435150 ns 452865 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6194166.5 ns 6180375 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6230791.5 ns 6232875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6222854 ns 6229979 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6218875 ns 6252666 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1001834 ns 1007257 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49254606 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8528604 ns 9641124.5 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1556125 ns 1560736 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 472667 ns 471375 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 339875 ns 253334 ns 1.34
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 253208 ns 341708 ns 0.74
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 902000 ns 902583 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46534 ns 46913 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 886552 ns
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 478875 ns 338020.5 ns 1.42
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 249963 ns 250492 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2333750 ns 2320416 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2036625 ns 1761167 ns 1.16
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1763167 ns 2033167 ns 0.87
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3203312 ns 3279375 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 258879 ns 260626 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 13032420 ns
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2178375 ns 2319917 ns 0.94
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 787818 ns 785678 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57542 ns 56166 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 45875 ns 39417 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39458 ns 46584 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83791 ns 82917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28376 ns 28863 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1391893 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1124083 ns 1130625 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77840.5 ns 79170.5 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2032250 ns 2020083 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2093187.5 ns 2062917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2091917 ns 2078437.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1972229.5 ns 2004145.5 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 235913 ns 238429 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35452366 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11558395.5 ns 15264270.5 ns 0.76
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1056250.5 ns 1057241 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57708 ns 56292 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46625 ns 39833 ns 1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39875 ns 47416 ns 0.84
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83916.5 ns 82875 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 49455 ns 50090 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 809068 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1084875 ns 1054834 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 72105.5 ns 74900 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921083 ns 1924167 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1945916.5 ns 1968250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1974729.5 ns 1980792 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1864791 ns 1891208 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 238800.5 ns 243592 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17238198 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10023791.5 ns 12800042 ns 0.78
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 934629 ns 1070466 ns 0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34886 ns 35236 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1200155 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 279833 ns 461750 ns 0.61
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 48281 ns 50011 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6792 ns 6709 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6208.5 ns 6520.5 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7000 ns 7625 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6667 ns 6541 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 212384.5 ns 216284 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 19751565 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5078916.5 ns 5088292 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 379104 ns 373774 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32763 ns 32446 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1167700 ns
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 253542 ns 248500 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 41150 ns 40510 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3833 ns 2917 ns 1.31
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3041 ns 3250 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3375 ns 3083 ns 1.09
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3125 ns 3458 ns 0.90
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 190584.5 ns 191592.5 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7912209 ns
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 1265542 ns 1031291.5 ns 1.23
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 153656.5 ns 153502 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 454937 ns 423917 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 454750 ns 473500 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 458229 ns 427833 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 427188 ns 424125 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 138010.5 ns 138519 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5819207 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2011000 ns 2048875 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 325693 ns 380684 ns 0.86
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3801708.5 ns 3799062.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3811125 ns 3822458 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3821292 ns 3802667 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3815375 ns 3823563 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 710674 ns 717031.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32043185 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10832625.5 ns 12950229 ns 0.84
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1491590 ns 1325953 ns 1.12
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49856479 ns 49840813 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35516042 ns 25988833 ns 1.37
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 26022291 ns 35525750 ns 0.73
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97102959 ns 96904729.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1594251.5 ns 1593190 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1009650 ns 1014101 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154623520.5 ns 153775938 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112350625 ns 89008896 ns 1.26
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 89065125 ns 112384750 ns 0.79
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 296081125 ns 296752479 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6489845.5 ns 6476290 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5556104 ns 5534451 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 17312.5 ns 15062.5 ns 1.15
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 16834 ns 15625 ns 1.08
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 14291.5 ns 16875 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15167 ns 15333 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 21687 ns 21010 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1157478.5 ns
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 218167 ns 204959 ns 1.06
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 27541 ns 27230 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 11042 ns 11083 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9000.5 ns 7583 ns 1.19
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 7875 ns 9209 ns 0.86
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17416.5 ns 17188 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 261161 ns 264057 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 9552185 ns
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1560042 ns 1736125.5 ns 0.90
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 155181 ns 152581.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8125 ns 7417 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8084 ns 8833 ns 0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10083.5 ns 10041.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8542 ns 8292 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 116504 ns 117259.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3349407.5 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 798667 ns 887417 ns 0.90
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 238952.5 ns 236902.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9854 ns 9708.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10229.5 ns 9292 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10083 ns 10791.5 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9958 ns 9584 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 623888 ns 631614 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22194230 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 4515667 ns 5189583 ns 0.87
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 656976 ns 668942 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9520.5 ns 8812.5 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9125 ns 9583 ns 0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11625 ns 11042 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9479.5 ns 9250 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 120769 ns 122641 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3531092 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 888291 ns 876791.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 79170 ns 74481 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 14208 ns 13708 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13208.5 ns 14979 ns 0.88
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 16333 ns 14416 ns 1.13
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17000 ns 13625.5 ns 1.25
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 594781 ns 601521.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19851682 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4474458 ns 4885250 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 357348.5 ns 353174 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 458 ns 584 ns 0.78
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34855 ns 35180 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1184802 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 423042 ns 441166 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 209842 ns 206562 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7709 ns 7042 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7084 ns 10458 ns 0.68
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 8042 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8042 ns 7125 ns 1.13
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 231568.5 ns 233713.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22217593.5 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5660167 ns 5300958.5 ns 1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 679867 ns 658707 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16042 ns 12666 ns 1.27
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 15333 ns 13833 ns 1.11
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 13854 ns 15667 ns 0.88
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10375 ns 10270.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 22215 ns 22010 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1158702.5 ns
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 205521 ns 186625 ns 1.10
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 194012 ns 191282 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31958 ns 32042 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32145.5 ns 32020.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32250 ns 32458 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 32250 ns 31854.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 276502.5 ns 278049 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11085623 ns
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1721729 ns 1885500 ns 0.91
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 605276.5 ns 606396.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 474834 ns 438291 ns 1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 445167 ns 484125 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 486875 ns 446062.5 ns 1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 474916 ns 477208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194410 ns 194398.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5748288 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2751937.5 ns 1968250 ns 1.40
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 326354 ns 375174 ns 0.87
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3823792 ns 3825292 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3824042 ns 3837396 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3849500 ns 3828687.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3847584 ns 3836875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 546410 ns 549907 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27926309 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10140750 ns 12010500 ns 0.84
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1388348.5 ns 1226382.5 ns 1.13
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 782652917 ns 836787979.5 ns 0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 542161792 ns 426008000 ns 1.27
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 420966458.5 ns 542930250 ns 0.78
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1553203729.5 ns 1533058916 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22558411.5 ns 22531506 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14062784.5 ns 14059203 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2518008250 ns 3617643875 ns 0.70
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1785714792 ns 1519606625 ns 1.18
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1525039667 ns 1791220042 ns 0.85
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4874366334 ns 4771769708 ns 1.02
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 367235490 ns 370760684 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88231178 ns 89879564 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 77646 ns 75354.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75959 ns 77417 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 82625 ns 80167 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77291 ns 76625 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 208602.5 ns 210924.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8336540 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 525229 ns 1045583.5 ns 0.50
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 109211 ns 110131.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 199042 ns 231500 ns 0.86
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 262396 ns 195167 ns 1.34
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 276625 ns 244583 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 287458 ns 234875 ns 1.22
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1056833 ns 1060035 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 40754174 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6090583 ns 6603312.5 ns 0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 646691 ns 643791.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199913000 ns 199256958.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139280375 ns 103813958.5 ns 1.34
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 104140916 ns 139098125 ns 0.75
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389020708 ns 388864875 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5827400 ns 5820038 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3419864.5 ns 3424485 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 620313062.5 ns 615907583.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 440225000 ns 354224562 ns 1.24
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 352767458 ns 440166291.5 ns 0.80
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1182963541 ns 1188432875 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26862507 ns 26804213.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21755438 ns 21815881 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7333 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 5416 ns 1.12
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5291 ns 6291 ns 0.84
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 10458 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28028 ns 28403 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1272660 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 627458 ns 361437.5 ns 1.74
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48010 ns 48715.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220750 ns 213333.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220521 ns 221708 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221875 ns 220916 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 209208.5 ns 205750 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 222206 ns 226122 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 29719216 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9434666.5 ns 11493583.5 ns 0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 527475 ns 541195.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8458.5 ns 7291 ns 1.16
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9209 ns 8417 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10375 ns 10770.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8083 ns 8583 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 119377.5 ns 119656 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3449983 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 855000 ns 855542 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72520 ns 72200 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8958.5 ns 7667 ns 1.17
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7500 ns 9395.5 ns 0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10084 ns 8375 ns 1.20
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10187.5 ns 7542 ns 1.35
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 521950 ns 526844.5 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18008002 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4315292 ns 4384667 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 321943 ns 322463 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 459 ns 1.36
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 458 ns 458 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 416 ns 1.50
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26701 ns 27306 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1195571.5 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 459104 ns 483625 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 48701 ns 48601 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10375 ns 9917 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8479 ns 10167 ns 0.83
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11375 ns 9542 ns 1.19
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9375 ns 8667 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 252977 ns 256488 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 24052360 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5702709 ns 5936416 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 397983.5 ns 396784 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 106500 ns 108542 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 98125 ns 85333 ns 1.15
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 87479.5 ns 100208 ns 0.87
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 147229 ns 146625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24863 ns 25074 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1228355 ns
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 263458.5 ns 244333 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 190212 ns 190632 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 478667 ns 479625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 509250 ns 518583.5 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 518562.5 ns 481000 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 520417 ns 478125 ns 1.09
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 234381 ns 235150 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11772054 ns
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2148312.5 ns 2164333 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 621156 ns 622586 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5375 ns 5500 ns 0.98
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5167 ns 5750 ns 0.90
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7500 ns 6666.5 ns 1.13
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4833.5 ns 4125 ns 1.17
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16136 ns 16723 ns 0.96
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 79061 ns 78130 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 14083 ns 11812 ns 1.19
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10208.5 ns 11916 ns 0.86
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 10292 ns 11000 ns 0.94
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 16708 ns 16500 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 213958 ns 216336 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 374963 ns 370958.5 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 40000 ns 35917 ns 1.11
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 50584 ns 50500 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52458.5 ns 52709 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13895.5 ns 13541 ns 1.03
batchedmm(16, Bsize=128)/forward/GPU/CUDA 19866 ns 20359 ns 0.98
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 87035.5 ns 79931 ns 1.09
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 38625 ns 36625 ns 1.05
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 30646 ns 29625 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 30791.5 ns 31458 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57666 ns 57209 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 192524 ns 195413 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 416745 ns 409364 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1604.5 ns 1959 ns 0.82
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1791 ns 1792 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2042 ns 2125 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1708 ns 1792 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 21123 ns 21014.5 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1140764 ns
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 294500 ns 324459 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 30391 ns 33550 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2042 ns 2209 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2125 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2292 ns 2417 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2291 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 205122.5 ns 207244.5 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 8519681 ns
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1638500 ns 1670895.5 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 139726.5 ns 137121 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5709 ns 4583 ns 1.25
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5104 ns 4750 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5750 ns 6333 ns 0.91
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4271 ns 4917 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 146388.5 ns 147827 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5488369.5 ns
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 465291 ns 771709 ns 0.60
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 72161 ns 71711 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8479.5 ns 8270.5 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8209 ns 8666 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 8792 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9042 ns 8125 ns 1.11
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 884256.5 ns 888135.5 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 38177021 ns
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5496125 ns 6483625 ns 0.85
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 394569 ns 391164 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56791 ns 56875 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57625 ns 56875 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 56875 ns 57750 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58166 ns 58292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37427.5 ns 37890 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1210467.5 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 468667 ns 379312.5 ns 1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 208482 ns 205582 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 487354.5 ns 448479 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 501250 ns 465229 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 492208.5 ns 464687.5 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 437438 ns 433500 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 267413 ns 270782 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26782051.5 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8248375 ns 10306000 ns 0.80
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 839679 ns 801818 ns 1.05
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3311333.5 ns 3291000 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2340166.5 ns 1770084 ns 1.32
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 1769958 ns 2335292 ns 0.76
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6319645.5 ns 6297083.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 205610 ns 206316 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 202712 ns 203322 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11497979 ns 11333854.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8319667 ns 6594562.5 ns 1.26
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 6588125 ns 8324937.5 ns 0.79
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21221896 ns 21089229 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 736463 ns 735605 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1065445 ns 1072271 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5562.5 ns 5625 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4666.5 ns 5667 ns 0.82
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6437.5 ns 7500 ns 0.86
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6104 ns 6750 ns 0.90
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 139569.5 ns 139700 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5734965.5 ns
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 826042 ns 867541.5 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 59531 ns 56260 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9333.5 ns 7500 ns 1.24
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7000 ns 14625 ns 0.48
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11875 ns 7375 ns 1.61
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8708 ns 7000 ns 1.24
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 764194 ns 766028 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 34028843.5 ns
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5176312.5 ns 5998084 ns 0.86
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 378403 ns 380414 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 99625 ns 117604 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 136708 ns 125375 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 101312.5 ns 102396 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 129709 ns 98145.5 ns 1.32
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 151420 ns 152876 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6034399 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1982667 ns 2030624.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 206692 ns 185692 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031041 ns 2021875 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2037417 ns 2037125 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2036291 ns 2013542 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2038584 ns 2033354 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 708221 ns 716061.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31488037 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11251291 ns 13591542 ns 0.83
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1126246 ns 1265732.5 ns 0.89
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33459 ns 29833 ns 1.12
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36750 ns 34167 ns 1.08
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 33833 ns 35542 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15506 ns 15704 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 86920 ns 71560.5 ns 1.21
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 4792 ns 2583 ns 1.86
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2709 ns 4583 ns 0.59
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3167 ns 3000 ns 1.06
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2291.5 ns 2209 ns 1.04
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 140769.5 ns 143464 ns 0.98
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 351474 ns 351354 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7208 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5334 ns 1.12
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 6166 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 10000 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36795 ns 37164 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1247042.5 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 351333 ns 334396 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49030 ns 49180 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213334 ns 212895.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220166.5 ns 222000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228125 ns 221041.5 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206875 ns 205979 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 244945 ns 249374 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24969632 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7965166.5 ns 9656333 ns 0.82
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 578090.5 ns 581561 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3916 ns 3959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3959 ns 4000 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21762 ns 21939 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2067928.5 ns
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 245104 ns 227375 ns 1.08
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 45631 ns 45671 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14875 ns 14916 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14916 ns 14708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14667 ns 15000 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14667 ns 14875 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 310256.5 ns 314728.5 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11269459 ns
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1000292 ns 1635750 ns 0.61
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 193502 ns 192832 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 102917 ns 109166 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 103667 ns 132541 ns 0.78
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 108625 ns 109875 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 131875 ns 102125 ns 1.29
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 137366.5 ns 138355.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5955500.5 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1988958 ns 2016354 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 200842 ns 188667 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1926354.5 ns 1918396 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1913500 ns 1939229 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1917792 ns 1913584 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1936729 ns 1937625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 692519 ns 700104 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33116808.5 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11144584 ns 13264020.5 ns 0.84
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1078360.5 ns 1233652.5 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17708 ns 17667 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22291.5 ns 18458 ns 1.21
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21250 ns 22270.5 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19146 ns 18250 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 109241 ns 110588.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3392625.5 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1271125 ns 1374104.5 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81331 ns 81891 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221229.5 ns 216417 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216791 ns 249771 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230083.5 ns 216541.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216083.5 ns 217312.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 522920 ns 527304 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19545470 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6165645.5 ns 8411584 ns 0.73
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 476780 ns 488925 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 26250 ns 24063 ns 1.09
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 31250 ns 28500 ns 1.10
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 27875 ns 29459 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1292 ns 1334 ns 0.97
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16312 ns 16479 ns 0.99
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 87751 ns 82590 ns 1.06
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 6625 ns 4708.5 ns 1.41
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4645.5 ns 4708 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 4917 ns 5208 ns 0.94
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4792 ns 4875 ns 0.98
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 207882.5 ns 210198 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 402074 ns 398304 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 305938 ns 304792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305917 ns 305542 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 307521 ns 311083 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 305375 ns 306375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 230214 ns 232191.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7500239 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 643000 ns 1156396 ns 0.56
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 280903 ns 279563 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 538541 ns 530625 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 549750 ns 542459 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 542666 ns 542000.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 529708 ns 535875 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1085631 ns 1096065 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44253871 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6154687.5 ns 6678000 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 872599 ns 873778.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19021 ns 20083 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19833.5 ns 20187.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22542 ns 23187 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21917 ns 20959 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 114174 ns 115290.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3531348.5 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1449271 ns 1265792 ns 1.14
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81471 ns 80731 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218834 ns 212042 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227542 ns 224625 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219708 ns 214333 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212708 ns 213708.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 761865.5 ns 758025 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24050167 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7412916.5 ns 10158583 ns 0.73
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 543136 ns 542975 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7125.5 ns 6458 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6479 ns 6917 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8458 ns 8542 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6084 ns 6417 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 141785 ns 143078 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5370056 ns
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 777458 ns 869500 ns 0.89
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 69581 ns 69771 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12958 ns 10709 ns 1.21
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9583.5 ns 9771 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10687.5 ns 10729.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9625 ns 10291 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 832452.5 ns 834187 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 38810557 ns
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5231375 ns 6274750 ns 0.83
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 395184 ns 396084 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5145.5 ns 5333 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4812.5 ns 4958 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6958 ns 7125 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6833 ns 5958 ns 1.15
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 144967.5 ns 146313.5 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5514807.5 ns
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 829125 ns 875000 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 70250 ns 67660 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7770.5 ns 7667 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 7500 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7667 ns 7625 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7208 ns 7459 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 790491 ns 797995 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 37869840 ns
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5670687 ns 6580999.5 ns 0.86
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 398424.5 ns 400804 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14518959 ns 14350958 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10120000 ns 7722625 ns 1.31
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 7708791.5 ns 10132750 ns 0.76
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27832250 ns 27757125 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 532409 ns 532327 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 399949.5 ns 403538.5 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46375083.5 ns 45806208 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33404583.5 ns 26766750.5 ns 1.25
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 26627416.5 ns 33520000 ns 0.79
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85835750 ns 85306916 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2644453 ns 2661047 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3278895 ns 3296413 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 66042 ns 66000 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66125 ns 67333 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 70520.5 ns 69854 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67875 ns 67375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 119873.5 ns 120529 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3330724 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1410021 ns 1329083.5 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 229907.5 ns 228112 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 453292 ns 444083 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 441208 ns 444083 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 450208 ns 441292 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 445541 ns 442521.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 732886.5 ns 736542.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26274297 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7781500 ns 10732062.5 ns 0.73
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 794638 ns 809398 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 667 ns 542 ns 1.23
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32132 ns 32886 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1164338 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 431645.5 ns 466834 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 49160 ns 49230 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8292 ns 9375 ns 0.88
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8708 ns 9250 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9250 ns 9500 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8959 ns 8125 ns 1.10
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 286401.5 ns 290314.5 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21940598 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5096125 ns 5519708 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 388934 ns 387394 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9792 ns 9875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9875 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9833 ns 9833 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9875 ns 9791 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23178 ns 23928 ns 0.97
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 1908743.5 ns
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 222541 ns 204979.5 ns 1.09
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 217383 ns 214872 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45875 ns 46000 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45917 ns 45667 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46167 ns 46666 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45875 ns 46250 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 293089 ns 293307 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 10988297.5 ns
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 982875 ns 1595562.5 ns 0.62
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 621107 ns 621217 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56250 ns 56333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57125 ns 56792 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 56334 ns 57083 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57792 ns 57834 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28527 ns 29516 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1186883 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 578645.5 ns 704333.5 ns 0.82
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 204943 ns 205082 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 448333 ns 455021 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 494125 ns 465375 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 507583 ns 473000 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 439437 ns 434208.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 247232 ns 252003 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 33216066 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9499166 ns 12166125 ns 0.78
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 891519.5 ns 893508.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 652937.5 ns 624416 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 647333 ns 662083 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 662854 ns 619083 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 668500 ns 633895.5 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 207996 ns 212333 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8125052.5 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1384354.5 ns 1471333 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 233282 ns 236152 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2235042 ns 2220834 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2238979 ns 2250000 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2248959 ns 2213792 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2260792 ns 2240750 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 984096 ns 990521.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 45382984 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8132833.5 ns 9717333 ns 0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1370494 ns 1376089 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20958 ns 19000 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20000 ns 19979 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22667 ns 22333.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22083 ns 22250 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 113160 ns 114382.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3278898 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1472792 ns 1244584 ns 1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81561 ns 81450 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222313 ns 222479 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 257542 ns 224959 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 232250 ns 221208 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 228000.5 ns 218917 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 734156.5 ns 738666.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27357269 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7692750 ns 10456396 ns 0.74
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 559476 ns 562856 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 541 ns 667 ns 0.81
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23248 ns 23746 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1222462 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 466625 ns 488062.5 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 51870 ns 49670 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9167 ns 9541.5 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9208 ns 9792 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9292 ns 9833 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9312.5 ns 9291.5 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 268568 ns 272510 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24289416 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6049709 ns 6224583.5 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 410500 ns 407824 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 10333 ns 7708 ns 1.34
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8458 ns 8687.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10354 ns 11166.5 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8333 ns 9666 ns 0.86
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 120393.5 ns 121220 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3445203 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 832874.5 ns 860208 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 72921 ns 72661 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7583 ns 7708 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8208 ns 7250 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 8125 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7770.5 ns 7334 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 511772 ns 516336 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 16339001 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 3959271 ns 4339813 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 328364 ns 328244 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1458 ns 1458 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1542 ns 1375 ns 1.12
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 2041.5 ns 0.90
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1541 ns 1583 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 21725 ns 21646 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1136020 ns
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 296000 ns 305020.5 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 194712 ns 191511.5 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3250 ns 3334 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3250 ns 3375 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3500 ns 3459 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3209 ns 3458 ns 0.93
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 220221.5 ns 224911 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9698879 ns
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1612667 ns 1768041 ns 0.91
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 596166 ns 595216 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148167 ns 145708.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 127709 ns 106562.5 ns 1.20
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 107958.5 ns 129292 ns 0.83
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225958 ns 225125 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 24338 ns 24473.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1138772 ns
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 270854.5 ns 252375 ns 1.07
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 40151 ns 38390 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 156125 ns 143771 ns 1.09
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 127209 ns 88167 ns 1.44
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 100750 ns 110771 ns 0.91
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 256666.5 ns 250875 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 218905 ns 220914.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10030041 ns
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2003417 ns 2045709 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 240417.5 ns 237933 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7250 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 5333 ns 1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 5916 ns 0.91
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10375 ns 10208 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32865 ns 33448 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1134920.5 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 562875 ns 335833 ns 1.68
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 52191 ns 50340 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 230854.5 ns 224250 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 270500 ns 228375 ns 1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 264875 ns 236083.5 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213771 ns 212562.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 263381.5 ns 267943.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28212764 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8517000 ns 9170083 ns 0.93
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 607266 ns 609306 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 14958 ns 14458 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15500 ns 14812.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16500 ns 16791.5 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15625 ns 15334 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 140749.5 ns 141134 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5465169 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 787125 ns 873104 ns 0.90
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 238512 ns 238182 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 22583 ns 24083.5 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23500 ns 23875 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24084 ns 24167 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23167 ns 23625 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 875101 ns 878285 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 37582744 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5600270.5 ns 6385188 ns 0.88
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 692048 ns 692226 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9125 ns 8916 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9250.5 ns 9687.5 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10521 ns 12125 ns 0.87
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9209 ns 10416 ns 0.88
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 124561 ns 124959.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3393331 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 802083 ns 918334 ns 0.87
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 79030 ns 75531 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13750 ns 14000 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14125 ns 13729 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14125 ns 14708 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13917 ns 13834 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 670894 ns 676549 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 20295661 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5274042 ns 5573041 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 375405 ns 373189 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9208.5 ns 8062 ns 1.14
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9167 ns 9750 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10438 ns 11916.5 ns 0.88
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9584 ns 10187.5 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 122339.5 ns 124116 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3319433.5 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 882875 ns 883646 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 75581 ns 69690 ns 1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12333.5 ns 12625 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12645.5 ns 12750 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12708 ns 13542 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12708 ns 12312 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 557225 ns 561116 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18661226 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4435167 ns 4630937 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 345844 ns 345083.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30292 ns 27208.5 ns 1.11
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34021.5 ns 32333.5 ns 1.05
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 30854.5 ns 31958 ns 0.97
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1791 ns 2041 ns 0.88
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16303 ns 16556 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 82211 ns 82091 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5270.5 ns 5229 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5354 ns 4687.5 ns 1.14
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5375 ns 5334 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6625 ns 6458 ns 1.03
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 140733 ns 142634 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 394064.5 ns 367964 ns 1.07
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 334 ns 0.75
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 26135 ns 26682 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1123770.5 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 474625 ns 482271 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 50311 ns 47990 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6375 ns 6500 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6145.5 ns 6562.5 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6416 ns 6709 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6416 ns 6188 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 187828 ns 190767.5 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 23626156 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5544437.5 ns 5874834 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 395104 ns 394363.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2042 ns 2042 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2000 ns 1917 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 1959 ns 2125 ns 0.92
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2000 ns 2000 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 26544 ns 27167 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1165809 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 461708.5 ns 492292 ns 0.94
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 209972 ns 210002 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15792 ns 16833.5 ns 0.94
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16375 ns 16417 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17000 ns 17354.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16084 ns 16458.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 275962 ns 278278 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24890960.5 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5972833 ns 6125604 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 713667.5 ns 714427 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 178250 ns 146500 ns 1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 184187.5 ns 171396 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 153417 ns 155584 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 147459 ns 154167 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 204372 ns 204804 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7857309.5 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1392667 ns 1553583 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 196752 ns 231362.5 ns 0.85
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1326895.5 ns 1324312.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1320625 ns 1348021 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1330833 ns 1319083.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1334750 ns 1326542 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 917280 ns 925557 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46023181 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6714958.5 ns 8602229.5 ns 0.78
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1108992 ns 1014380 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25229.5 ns 23792 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 26583 ns 25354 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26833 ns 28250 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25917 ns 24604.5 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 239791.5 ns 238411 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7972748 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 980542 ns 1139000 ns 0.86
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 116941 ns 120312 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 179917 ns 117854 ns 1.53
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 141604.5 ns 124667 ns 1.14
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 127354.5 ns 174458.5 ns 0.73
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 118604 ns 118354 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1092585 ns 1098934 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43816902.5 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6033333 ns 7919042 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 606086 ns 614406 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 250 ns 1.34
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22970 ns 23522 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1175116 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 456125 ns 491791.5 ns 0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 48591 ns 50790 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6625 ns 6583 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6750 ns 6375 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6542 ns 6833 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6459 ns 6167 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 204628 ns 207746.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23603781 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6092458 ns 5956667 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 397554 ns 395954 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6125 ns 5958 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6334 ns 6041.5 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6709 ns 7604.5 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5937.5 ns 6500 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 147027 ns 147981.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5559804 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 583167 ns 774875 ns 0.75
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 237472 ns 239202 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9666.5 ns 10000 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10041 ns 10083 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10041 ns 10667 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9854 ns 9791.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 910526.5 ns 916090 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 39406121 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5909375 ns 7392292 ns 0.80
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 686288 ns 688747.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 666 ns 708 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22655 ns 23031 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2037996 ns
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 222583 ns 209625 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 215862 ns 215712 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4584 ns 4833 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4584 ns 4584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4625 ns 4833 ns 0.96
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4625 ns 4625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 232442.5 ns 230125.5 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9881227 ns
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1690521 ns 1700146 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 600181 ns 599396 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8562.5 ns 8396 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7937.5 ns 8000 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9771 ns 10125 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8520.5 ns 9062.5 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 122197 ns 123106.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3361719 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 761542 ns 907333 ns 0.84
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 76241 ns 76081 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8792 ns 8792 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8459 ns 8459 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8875 ns 9041 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8270.5 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 595652 ns 600302.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20278296 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4718125 ns 4960583.5 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 354274 ns 353604 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 125917 ns 122750 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 128958 ns 95625 ns 1.35
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 96959 ns 130334 ns 0.74
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 181416 ns 183125 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46106 ns 46375 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 96666 ns 98981 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 317875 ns 303292 ns 1.05
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 346375 ns 182750 ns 1.90
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 178979 ns 345917 ns 0.52
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 569062.5 ns 608729 ns 0.93
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 191966 ns 195364.5 ns 0.98
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 487875 ns 494734 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397125 ns 396125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288292 ns 215375 ns 1.34
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215791 ns 287708 ns 0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 757959 ns 756000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43243.5 ns 43820 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1345812 ns
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 404062.5 ns 358000 ns 1.13
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 83381 ns 83390 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1459854 ns 1446958.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136645.5 ns 863667 ns 1.32
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 865270.5 ns 1133375 ns 0.76
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2359813 ns 2443417 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 259216 ns 252085 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 11177773 ns
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1833666 ns 1851958 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 349653.5 ns 350863.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 642333 ns 626459 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 649875 ns 682479 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 660416.5 ns 615000 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 623542 ns 641167 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 202604 ns 203045 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7957177 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1348791.5 ns 1359542 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 265108 ns 254223 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2448583 ns 2435250 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2452104 ns 2470979.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2473833 ns 2445042 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2455791 ns 2415792 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1005284.5 ns 1014910 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50767854.5 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10026166 ns 11589916 ns 0.87
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1511186 ns 1478675 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 32375 ns 29458.5 ns 1.10
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35749.5 ns 33812.5 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34312.5 ns 34541 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 916 ns 1042 ns 0.88
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15700 ns 15442 ns 1.02
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 81140 ns 85531 ns 0.95
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3166 ns 3250 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3083 ns 3042 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3125 ns 3416 ns 0.91
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3000 ns 3166 ns 0.95
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 139352.5 ns 142240.5 ns 0.98
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 344664 ns 360413 ns 0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 405583 ns 404291 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 408750 ns 403708 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 403083 ns 409042 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 422042 ns 421875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 43343.5 ns 44262 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1354478 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1109583 ns 1119041 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 240442 ns 242882 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3869125 ns 3855208 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3994396 ns 3997771 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3999708 ns 3998125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3774354.5 ns 3773938 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 244251 ns 248524 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35978667 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11608750 ns 14976771 ns 0.78
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1245273.5 ns 1453704 ns 0.86
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3875 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34866 ns 34278.5 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1227111 ns
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 175291 ns 161167 ns 1.09
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 42710 ns 40280 ns 1.06
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15750 ns 15875 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15667 ns 15583 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15500 ns 16041 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15542 ns 15791 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 256386 ns 257529.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8908913 ns
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 872958 ns 864083.5 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 174412 ns 168256.5 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404166 ns 403417 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295666 ns 221375 ns 1.34
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 221625 ns 295666 ns 0.75
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760500 ns 760500 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113218 ns 113952 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1016425 ns
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 393437 ns 335792 ns 1.17
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 90851 ns 88615.5 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1473333 ns 1471958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1161666 ns 887791.5 ns 1.31
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 888166.5 ns 1157167 ns 0.77
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2383791 ns 2467666 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 241468.5 ns 255583.5 ns 0.94
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 11846004 ns
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1877938 ns 1946854 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 360704 ns 360243.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 459 ns 584 ns 0.79
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 25943 ns 26902 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1192515 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 470937.5 ns 486187.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 208143 ns 208227.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7458 ns 7667 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7583 ns 7666 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7458 ns 7916.5 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7709 ns 7250 ns 1.06
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 214477.5 ns 219818 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25777295.5 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5998979.5 ns 6151042 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 700287 ns 686716.5 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 831271 ns 825562.5 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 617041 ns 468833 ns 1.32
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 470000 ns 620188 ns 0.76
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1545709 ns 1547479 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129860.5 ns 131055 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 169171.5 ns 231953 ns 0.73
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2689145.5 ns 2669042 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2013250 ns 1538125.5 ns 1.31
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1538125 ns 2006270.5 ns 0.77
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4941375 ns 4938583 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 241461 ns 242713 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 867019 ns 860168 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 291 ns 1.29
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31985 ns 32634 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1142400.5 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 453291.5 ns 452000 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 48580 ns 48761 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6250 ns 6437.5 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 6541.5 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6416 ns 6750 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6166 ns 6000 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 224593 ns 228896 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21127237.5 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5053916 ns 5302916 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 372504 ns 369843 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2423917 ns 2391250 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2397291.5 ns 2400000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2403792 ns 2405958 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2371125 ns 2372125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 203214 ns 204395 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8123069 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1393562 ns 1597249.5 ns 0.87
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 332763.5 ns 377704 ns 0.88
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4645250 ns 4646708.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4645125 ns 4648958 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4654250 ns 4659021 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4658042 ns 4685792 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 910071 ns 915367 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 48057492 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6619584 ns 7426833 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1416215 ns 1261857 ns 1.12
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7438 ns 7479 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7083 ns 7125 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 6958 ns 7959 ns 0.87
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6979 ns 7250 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 23722 ns 23573 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1176238 ns
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 263000 ns 243500 ns 1.08
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 34150 ns 39571 ns 0.86
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 68020.5 ns 70291.5 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 50312 ns 45542 ns 1.10
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 53292 ns 63500 ns 0.84
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 32583 ns 33104 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 218170 ns 217821 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10824043 ns
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2030958 ns 2084458 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 244333 ns 226612 ns 1.08
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21437 ns 20396 ns 1.05
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25333 ns 24479.5 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 23479.5 ns 24854.5 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 6083 ns 5500 ns 1.11
batchedmm(2, Bsize=512)/forward/GPU/CUDA 16786.5 ns 16892 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 91501 ns 85151 ns 1.07
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12208.5 ns 11958 ns 1.02
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10083 ns 9000 ns 1.12
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 9458.5 ns 10958.5 ns 0.86
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17854.5 ns 18167 ns 0.98
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 228126 ns 227664.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 376824 ns 389024 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406500 ns 404791 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297312.5 ns 223500 ns 1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 223791 ns 296709 ns 0.75
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762958 ns 762750 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46683 ns 46360 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1412498.5 ns
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 476666.5 ns 340000 ns 1.40
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89121 ns 88940 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1499875 ns 1485750.5 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1167833.5 ns 895812 ns 1.30
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 894271 ns 1165791.5 ns 0.77
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2389834 ns 2472333 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 292932.5 ns 290272 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 13048501 ns
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2098166 ns 2106583 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 380285 ns 377424 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 433875 ns 432770.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 436334 ns 430583 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 430709 ns 436958 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448020.5 ns 448209 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54564 ns 54092 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1024914 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1099208.5 ns 1074083.5 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 236522.5 ns 235772 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3897208 ns 3888958 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4021833 ns 4016791.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4027708 ns 4025938 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3812146 ns 3793958.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 264154 ns 263523 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31494055 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10517749.5 ns 11929333 ns 0.88
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1245028 ns 1247352 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8750 ns 8750 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7666 ns 6875 ns 1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 6834 ns 7667 ns 0.89
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12459 ns 12417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24707 ns 24084 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2085760.5 ns
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 225250 ns 211583 ns 1.06
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 215337.5 ns 216562 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45042 ns 45125 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45125 ns 44750 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45083 ns 45375 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45187.5 ns 45187.5 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 350283.5 ns 347338.5 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11134325 ns
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1805125 ns 1883625.5 ns 0.96
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 662902 ns 671931.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 93959 ns 104146.5 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 129416 ns 86437 ns 1.50
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 87916.5 ns 92875 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 125062.5 ns 126625 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 189645 ns 189767 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5972246.5 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1906021.5 ns 1966250 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 201947 ns 183982 ns 1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2011375 ns 2011000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2017791 ns 2025000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2029459 ns 2009458 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2017916.5 ns 2016917 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 537811 ns 535873.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27667805 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9734479.5 ns 11961958.5 ns 0.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1103102 ns 982380 ns 1.12

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.