Skip to content

Commit

Permalink
fix: update default rng for reactant (#1152)
Browse files Browse the repository at this point in the history
* fix: update default rng for reactant

* feat: handle RNGs in layers correctly
  • Loading branch information
avik-pal authored Jan 1, 2025
1 parent 1951c86 commit 367680b
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 10 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ LinearAlgebra = "1.10"
LossFunctions = "0.11.1, 1"
LuxCore = "1.2"
LuxLib = "1.3.7"
MLDataDevices = "1.6"
MLDataDevices = "1.6.6"
MLUtils = "0.4.4"
MPI = "0.20.19"
MacroTools = "0.5.13"
Expand All @@ -110,7 +110,7 @@ NNlib = "0.9.26"
Optimisers = "0.4.1"
Preferences = "1.4.3"
Random = "1.10"
Reactant = "0.2.12"
Reactant = "0.2.13"
Reexport = "1.2.2"
ReverseDiff = "1.15"
SIMDTypes = "0.1"
Expand Down
4 changes: 2 additions & 2 deletions lib/MLDataDevices/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MLDataDevices"
uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "1.6.5"
version = "1.6.6"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Expand Down Expand Up @@ -66,7 +66,7 @@ Metal = "1"
OneHotArrays = "0.2.5"
Preferences = "1.4"
Random = "1.10"
Reactant = "0.2.6"
Reactant = "0.2.13"
RecursiveArrayTools = "3.8"
ReverseDiff = "1.15"
SparseArrays = "1.10"
Expand Down
11 changes: 6 additions & 5 deletions lib/MLDataDevices/ext/MLDataDevicesReactantExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,21 @@ module MLDataDevicesReactantExt

using Adapt: Adapt
using MLDataDevices: MLDataDevices, Internal, ReactantDevice, CPUDevice, get_device_type
using Random: Random
using Reactant: Reactant, XLA, ConcreteRArray, ConcreteRNumber, TracedRArray,
TracedRNumber

MLDataDevices.loaded(::Union{ReactantDevice, Type{<:ReactantDevice}}) = true
MLDataDevices.functional(::Union{ReactantDevice, Type{<:ReactantDevice}}) = true

# Default RNG: Forward to CPU, we will compile it
# Default RNG
function MLDataDevices.default_device_rng(::ReactantDevice)
return MLDataDevices.default_device_rng(CPUDevice())
return Reactant.TracedRandom.default_rng()
end

# Query Device from Array
function Internal.get_device(x::Union{ConcreteRNumber, ConcreteRArray})
client = XLA.client(x.data)
device = XLA.device(x.data)
return ReactantDevice(client, device)
return ReactantDevice(XLA.client(x.data), XLA.device(x.data))
end

function Internal.get_device(::Union{TracedRArray, TracedRNumber})
Expand Down Expand Up @@ -54,4 +53,6 @@ function Adapt.adapt_storage(dev::ReactantDevice, x::ConcreteRArray)
return Adapt.adapt(dev, Adapt.adapt(CPUDevice(), x))
end

Adapt.adapt_storage(::CPUDevice, ::Reactant.ConcreteRNG) = Random.default_rng()

end
3 changes: 2 additions & 1 deletion lib/MLDataDevices/test/xla_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ using FillArrays, Zygote # Extensions

device = reactant_device()
aType = MLDataDevices.functional(ReactantDevice) ? Reactant.ConcreteRArray : Array
rngType = Random.AbstractRNG
rngType = MLDataDevices.functional(ReactantDevice) ? Reactant.ConcreteRNG :
Random.AbstractRNG

ps_xpu = ps |> device
@test get_device(ps_xpu) isa ReactantDevice
Expand Down
34 changes: 34 additions & 0 deletions test/reactant/layer_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,37 @@ end
end
end
end

# Exercises the stochastic Lux layers (AlphaDropout, Dropout, VariationalHiddenDropout)
# on the Reactant device. For each layer it checks that:
#   * the RNG stored in the layer state is a `Reactant.ConcreteRNG` once moved to the device,
#   * the lowered HLO contains a `stablehlo.rng_bit_generator` op,
#   * the state returned by a compiled forward pass still holds a `Reactant.ConcreteRNG`
#     whose seed differs from the seed in the input state.
@testitem "Dropout Layers" tags=[:reactant] setup=[SharedTestSetup] skip=:(Sys.iswindows()) begin
    using Reactant, Lux, Random

    @testset "$(mode)" for (mode, atype, dev, ongpu) in MODES
        if mode == "amdgpu"
            @warn "Skipping AMDGPU tests for Reactant"
            continue
        end

        # The device supplied by MODES is replaced: everything below runs on Reactant.
        dev = reactant_device(; force=true)

        # Select the Reactant backend that corresponds to the current mode.
        Reactant.set_default_backend(ongpu ? "gpu" : "cpu")

        @testset for layer in (AlphaDropout, Dropout, VariationalHiddenDropout)
            model = layer(0.5f0)
            ps, st = dev(Lux.setup(Random.default_rng(), model))
            x = dev(randn(Float32, 10, 10))

            # Moving the state to the Reactant device must convert the RNG as well.
            @test st.rng isa Reactant.ConcreteRNG

            # Random numbers are expected to be generated inside the compiled program.
            hlo = @code_hlo model(x, ps, st)
            @test occursin("stablehlo.rng_bit_generator", repr(hlo))

            # The forward pass must hand back an updated RNG in the new state.
            _, st_new = @jit model(x, ps, st)
            @test st_new.rng isa Reactant.ConcreteRNG
            @test st.rng.seed != st_new.rng.seed
        end
    end
end

5 comments on commit 367680b

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error while trying to register: Version 1.4.3 already exists

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/MLDataDevices

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/122251

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a MLDataDevices-v1.6.6 -m "<description of version>" 367680bdee0368ed4c2e9f98eb914acf8117cc50
git push origin MLDataDevices-v1.6.6

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 367680b Previous: 63d3434 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4042 ns 4083.5 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4125 ns 4042 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4833.5 ns 4917 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3958 ns 3833 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60780 ns 59941 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 11250 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10333 ns 10500 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10625 ns 11541 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10833 ns 10958 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 423470 ns 421187 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1084 ns 1167 ns 0.93
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1125 ns 1250 ns 0.90
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1416 ns 1417 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1208 ns 1167 ns 1.04
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18313 ns 17939 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4042 ns 4125 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4083 ns 3958 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4208 ns 4292 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3625 ns 4062.5 ns 0.89
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 110716 ns 108432 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57375 ns 57333 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46292 ns 46250 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46500 ns 47041 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82709 ns 82125 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37768 ns 36736 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2006604.5 ns 1991000.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2082209 ns 2094313 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2011667 ns 2094167 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2018937.5 ns 1997041.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196514.5 ns 194384.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 141709 ns 143854.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144000 ns 143125 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145187 ns 147041 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144208 ns 144750 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165424.5 ns 165602 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1001541.5 ns 1114896 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1118791.5 ns 1128937.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1097124.5 ns 1128792 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1141417 ns 1114542 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 532439 ns 526049 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3667 ns 3458 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3542 ns 3416 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 3917 ns 4145.5 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3541.5 ns 3584 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 71776.5 ns 70040 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9042 ns 8917 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9584 ns 9042 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8500 ns 9459 ns 0.90
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9042 ns 8917 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 486557 ns 447136 ns 1.09
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15125 ns 15041 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17792 ns 17541.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16916.5 ns 17625 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15250 ns 15917 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 56432 ns 54471 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214500 ns 217417 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 214625 ns 213417 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215333.5 ns 214979.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216041 ns 225771 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 280343 ns 270355 ns 1.04
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 791 ns 0.84
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 584 ns 625 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 708 ns 708 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 667 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17273.5 ns 17190 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1583 ns 1500 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1667 ns 1500 ns 1.11
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1667 ns 1666 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1541 ns 1500 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 103457 ns 101385 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7000 ns 7208 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5937.5 ns 5916 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5709 ns 5917 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9833 ns 9875 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24396 ns 23163 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222750 ns 223083 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229041 ns 228500 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230041 ns 230208 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213500 ns 217000 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 171992 ns 166961 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3916 ns 3958 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4000 ns 3958 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23948 ns 23600 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16792 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16583 ns 16750 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17041 ns 17041 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16916 ns 17000 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 165565.5 ns 161078 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 572458 ns 577750 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 576208 ns 572709 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 581250 ns 574833 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 575042 ns 575625 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113609 ns 112893 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1419604 ns 1420292 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1420333 ns 1425209 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1421834 ns 1426583 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1421062.5 ns 1429020.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 216706.5 ns 211317.5 ns 1.03
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1089896 ns 1077500 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 966312 ns 960792 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1351792 ns 1350854.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1307959 ns 1298750 ns 1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA 276909 ns 273506 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5979271 ns 6004937.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4608000 ns 4547292 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4925667 ns 4929708.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5767000 ns 5555333 ns 1.04
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1097403.5 ns 1074648 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23800 ns 23430 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2084 ns 2084 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2167 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2084 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 174099 ns 173597 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4209 ns 4292 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4042 ns 3750 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5020.5 ns 4917 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3667 ns 3958 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 66593 ns 65160 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10958 ns 11209 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11167 ns 11250 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12083 ns 12208 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11167 ns 11125 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 455844 ns 447745.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6583 ns 6166 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6417 ns 6375 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7562.5 ns 8125 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6333 ns 6583 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 53149 ns 52163 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17375 ns 16750 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17250 ns 18209 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18250 ns 18500 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16458 ns 17000 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 301789.5 ns 298259.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 584 ns 542 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 33109.5 ns 32532 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8542 ns 8208 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8500 ns 8667 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9375 ns 9333 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8416.5 ns 8083 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 161412.5 ns 158900.5 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64666 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64583 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64459 ns 64458 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64208 ns 64375 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112066 ns 111633.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 275959 ns 274542 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 279333 ns 287042 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 280167 ns 274708 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 284791 ns 280292 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 190816.5 ns 186083 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3359666.5 ns 3329333 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3020708 ns 3017229 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3019708 ns 3024687.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4044937.5 ns 3956250 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 582824 ns 577429 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7633375 ns 7623958 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7444749.5 ns 7210334 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7451687.5 ns 7453270.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8276916.5 ns 8209375 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1416070 ns 1359043.5 ns 1.04
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17541687.5 ns 17513124.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17532229.5 ns 17530146 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17547042 ns 17518395.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14143625 ns 14128813 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23437021 ns 23645979.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33669000 ns 33821104.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 36847792 ns 37080041 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35241729 ns 34888834 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1852807 ns 1866294 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 188072458 ns 189046208 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 164284791 ns 164619624.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 152400917 ns 152711479 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 434137916 ns 436948083 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13886569 ns 13894254.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 288796896 ns 289373791 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 251588375 ns 251042625 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296639417 ns 296809167 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 474281875 ns 474994229.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22000 ns 22250 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22625 ns 24542 ns 0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24250 ns 23188 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21812.5 ns 22417 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 98991 ns 96027 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 104791 ns 116584 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103292 ns 113125 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104708 ns 117833 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103625 ns 103854 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 514494 ns 510213 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5917 ns 5833 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 5917 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6459 ns 6812.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6167 ns 6292 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 69465 ns 68158.5 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14417 ns 14875 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15250 ns 14812.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15459 ns 14875 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14666 ns 15042 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 483934.5 ns 478636.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2986042 ns 3009146 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2014792 ns 2061334 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2274354.5 ns 2279208 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4589125 ns 4871541.5 ns 0.94
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 584502 ns 589315.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23505916.5 ns 23547375 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18035749.5 ns 17982875.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 16922042 ns 16893209 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 34856104.5 ns 34849958 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2763874 ns 2772744 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33341541.5 ns 33314834 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27602208 ns 27464208 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27326333 ns 27410208 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41263417 ns 41078500 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72791.5 ns 72375 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73208 ns 74375 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 83958 ns 75166 ns 1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 83208 ns 75167 ns 1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 103702 ns 102682 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 286979.5 ns 286145.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 206625.5 ns 210021.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 322750 ns 315000 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 322333 ns 218458 ns 1.48
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 559306 ns 553543 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11458.5 ns 11875 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11666.5 ns 11708 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12333 ns 13334 ns 0.92
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11958 ns 13125 ns 0.91
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 73645.5 ns 71259 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26208.5 ns 26833.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27000 ns 26375 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27416 ns 27417 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26645.5 ns 25854.5 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 483328.5 ns 477064.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11917 ns 12041.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 14750 ns 12229.5 ns 1.21
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13708 ns 13958 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12708 ns 12584 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 54699.5 ns 53895.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25375 ns 25875 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25500 ns 25834 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26333 ns 26125 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27875 ns 25667 ns 1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 308185.5 ns 305285 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 182041.5 ns 179417 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 181583 ns 179417 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 183167 ns 181041 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182167 ns 180042 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58753 ns 58113 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 592604 ns 590084 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 583041 ns 585083 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 594209 ns 591062.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 586791 ns 584333 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 294181 ns 289662.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6083 ns 6083 ns 1
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5958.5 ns 5500 ns 1.08
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6833 ns 7542 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6250 ns 6604.5 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 72095.5 ns 70599 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14375 ns 14291 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13083 ns 14209 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14791 ns 14917 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14292 ns 13062.5 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 473402.5 ns 466681.5 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1210604.5 ns 1223541.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1239854 ns 1236625 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1297479 ns 1285666.5 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1024875 ns 1007959 ns 1.02
batchedmm(512, Bsize=4)/forward/GPU/CUDA 300941 ns 301986 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4097875.5 ns 4226959 ns 0.97
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4434062.5 ns 4384249.5 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4563541 ns 4572312.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3722313 ns 3695104.5 ns 1.01
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1037751.5 ns 1047036 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1791 ns 1833 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23494 ns 24200 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4834 ns 4875 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4834 ns 4833 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4875 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 188396 ns 192268.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5625 ns 5458 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5459 ns 5542 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6500 ns 6791.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5562.5 ns 5792 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 54865 ns 56595.5 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10583 ns 10500 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10500 ns 10416 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11125 ns 11375 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10666 ns 10875 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 324083 ns 335979.5 ns 0.96
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 334 ns 0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 375 ns 333 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 334 ns 0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22774 ns 23172 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2708 ns 2833 ns 0.96
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2750 ns 2709 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2959 ns 3042 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2708 ns 2791 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 158123.5 ns 162255.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11375 ns 11084 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11083 ns 11000 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12125 ns 13563 ns 0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11542 ns 11458 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 56425.5 ns 58685.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24583 ns 24542 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24667 ns 24542 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24833.5 ns 25167 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25250 ns 25000 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 289503 ns 298266 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4250 ns 4250 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4250 ns 4250 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24426.5 ns 25307 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16417 ns 16166 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16167 ns 16292 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16334 ns 16334 ns 1
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16125 ns 16084 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 194624 ns 199542 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5709 ns 5709 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5708 ns 5917 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5834 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 5834 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33182 ns 33833 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20792 ns 20292 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20645.5 ns 20375 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 20792 ns 20875 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20417 ns 20250 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 174846 ns 178083 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 423688 ns 420500 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 381917 ns 372625 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 480521 ns 482833 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 104125 ns 103292 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66873.5 ns 67723.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 934375 ns 922417 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 984083 ns 955208.5 ns 1.03
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1186625 ns 1180875 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 471042 ns 379083 ns 1.24
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 189890.5 ns 192988 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81458.5 ns 136917 ns 0.59
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 80125 ns 79854.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 81104.5 ns 82750 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 136333 ns 81167 ns 1.68
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192847 ns 194081 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1918292 ns 1915042 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1908625 ns 1919750 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1922750 ns 1926125 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1953687.5 ns 1915750 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 394765 ns 401908.5 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21680 ns 22364 ns 0.97
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 167307.5 ns 174295 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6625 ns 6042 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6333 ns 6500 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7375 ns 7812.5 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6667 ns 6541 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 59094.5 ns 61489.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8958 ns 9000 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8959 ns 8792 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9417 ns 9375 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9416 ns 9459 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 303401 ns 308375 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120415166.5 ns 118419979.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173861833 ns 173770000 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147873916 ns 148397083 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104464750 ns 104919541 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5466659 ns 5493586 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 607892187.5 ns 611739750.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555380583 ns 553521958 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 449180562.5 ns 449841709 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 624687437 ns 631089333.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34960099 ns 38209825 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 655676042 ns 652096250 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 664719854.5 ns 661126562.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 586317000.5 ns 580970687.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 854444125 ns 848782167 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57541 ns 58667 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47500 ns 47500 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46625 ns 48250 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85500 ns 83625 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37532 ns 37628 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919792 ns 1919312.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1980000 ns 1980333.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1978083.5 ns 1982541.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1915584 ns 1895625 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 173336.5 ns 176341 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 266563 ns 266208 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 285125 ns 265334 ns 1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 286313 ns 288604 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 267916 ns 268167 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 130327.5 ns 130454.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 588541 ns 664646 ns 0.89
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 688375 ns 671062.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 691667 ns 665875 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 713875 ns 597542 ns 1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 704236.5 ns 690208 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2209792 ns 2192312.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2211250 ns 2179542 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2214666 ns 2181333.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2251125 ns 2207146 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133526 ns 134808 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5473459 ns 5469791 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5495771 ns 5472958.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5506084 ns 5499916 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5555625 ns 5442583.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 758118 ns 720984 ns 1.05
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 641209 ns 644667 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 638417 ns 644084 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 648750 ns 642042 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 647250 ns 644167 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46678 ns 47636.5 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1823542 ns 1819917 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1728500 ns 1720500 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1721125 ns 1721792 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2101541 ns 2100000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 220988 ns 224071 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58375 ns 57667 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47291 ns 46666 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46667 ns 46583 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84417 ns 83750 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28560 ns 28795 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2021604 ns 2029583 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2078542 ns 2087375 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2089792 ns 2087791.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2018458 ns 1991416.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 188289 ns 190320 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13165083 ns 13371041.5 ns 0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12437062.5 ns 12439187.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12496625 ns 12491875 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15241708 ns 15195833.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 511138.5 ns 516777 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47044896 ns 47119104.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41734229 ns 41727062.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41006041 ns 41051417 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58474250 ns 58599458 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2887641 ns 2892052.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 74158583 ns 74212666 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 68293166 ns 67877750 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90787478.5 ns 90536499.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 76120020.5 ns 98549792 ns 0.77
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58708 ns 58375 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47417 ns 46459 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47333 ns 47708 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81500 ns 83958 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48467.5 ns 47165 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1906541 ns 1919583.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1966979 ns 1980791 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1972250 ns 1979229.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1919083.5 ns 1886958 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 194955.5 ns 193816.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 417 ns 375 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 31682 ns 32624 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 5979.5 ns 5833 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 5959 ns 6083 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6417 ns 6416.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6250 ns 5833 ns 1.07
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 173280.5 ns 171378.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31661 ns 32204 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2583 ns 2583 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2834 ns 2875 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2584 ns 2625 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 162166.5 ns 159764 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 285912791.5 ns 286393770.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 341793875 ns 340253500 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 314064437.5 ns 313806270.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 269291750 ns 268566520.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7104649.5 ns 7103110 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1013628833 ns 1012043792 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 955735416 ns 955581708 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 855387437.5 ns 855297583 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1263250834 ns 1259239875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33975753 ns 33847341 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1379120562.5 ns 1418325958.5 ns 0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1314342812 ns 1338395020.5 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1634956500 ns 1636087292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1372311479 ns 1775858125 ns 0.77
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1410229 ns 1409833 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1415750 ns 1414458.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1412896 ns 1465562.5 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1460375 ns 1413458.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127578 ns 127951 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5011584 ns 5027250 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5015500 ns 5036354 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5020521 ns 5030437.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5052375 ns 5027250.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 577903.5 ns 479205.5 ns 1.21
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 171180458 ns 170869291 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 128541250 ns 128735708 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 109850250 ns 105431542 ns 1.04
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 169107792 ns 167706958 ns 1.01
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4873683 ns 4877746.5 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 624949333 ns 511068334 ns 1.22
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 491287250 ns 490911792 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 454790833 ns 385742875 ns 1.18
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 648542167 ns 650161000 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16059874 ns 16340937 ns 0.98
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8910395.5 ns 9003042 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8995792 ns 8983042 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7901000 ns 7909375 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9817770.5 ns 9604229.5 ns 1.02
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1593491 ns 1611438.5 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 35975583 ns 36334167 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37440812.5 ns 37265291.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33423291.5 ns 33553354 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 38560271 ns 37555333 ns 1.03
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6452757.5 ns 6454550 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47625 ns 47333 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47583 ns 47500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47625 ns 47625 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47375 ns 47417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18605 ns 18252 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50250 ns 50417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50417 ns 50666 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50625 ns 50625 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50459 ns 50250 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 218596.5 ns 164880 ns 1.33
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6416 ns 6417 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6625 ns 6792 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7209 ns 7583.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7000 ns 6792 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 120537.5 ns 76692.5 ns 1.57
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9667 ns 10125 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9583 ns 9750 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10625 ns 10250 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10209 ns 9875 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 676959 ns 448214.5 ns 1.51
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5584 ns 5666 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6167 ns 5791 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7146 ns 7583 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5562.5 ns 6042 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 144983 ns 81735 ns 1.77
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12875 ns 13208 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13084 ns 12709 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13875 ns 13375 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12959 ns 13417 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 555671 ns 399198.5 ns 1.39
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 959 ns 959 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 959 ns 1000 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32054 ns 32447 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7666 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7875 ns 7708 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8167 ns 7958 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7958.5 ns 8166 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 215727.5 ns 187787.5 ns 1.15
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23166.5 ns 23167 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23292 ns 23209 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23458 ns 23250 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23334 ns 23292 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18589.5 ns 18320.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52625 ns 52917 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52500 ns 52167 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52958 ns 52917 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52333 ns 52875 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 299146 ns 214503.5 ns 1.39
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1401500 ns 1398125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1396145.5 ns 1402146 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1398562.5 ns 1406437.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1435792 ns 1448937.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195172 ns 196187.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5009646 ns 5003458 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4800875 ns 5029708 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5005896 ns 5015042 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5025041.5 ns 5005729.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 612010.5 ns 509817 ns 1.20
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3032250 ns 3051834 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072292 ns 2076520.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2300667 ns 2302500 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4921042 ns 4658291.5 ns 1.06
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 580134 ns 581685 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24343228.5 ns 24315708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18906020.5 ns 18877250 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17758521.5 ns 17822166 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35734042 ns 35790999.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2830179 ns 2842698 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33956916.5 ns 33982916.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28347958 ns 28228208.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28079666 ns 27940958 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 42065000 ns 41757334 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144437916 ns 143078500 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 147635291 ns 146668125 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 125109916 ns 127355624.5 ns 0.98
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173674875 ns 171841729.5 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22545545 ns 22550146 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 908256562.5 ns 1234730083.5 ns 0.74
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1584608041.5 ns 1060723417 ns 1.49
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 749118208 ns 1027004875 ns 0.73
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 669868292 ns 674561583 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118395391 ns 117659213 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 81333 ns 74125 ns 1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75042 ns 73146 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 77166 ns 76000 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 73625 ns 85834 ns 0.86
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 243285.5 ns 175925 ns 1.38
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 287145.5 ns 215750 ns 1.33
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 285833 ns 192541.5 ns 1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 283104.5 ns 284542 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 279041 ns 285708 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1239705 ns 952026.5 ns 1.30
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35487666 ns 35486000 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36325875 ns 36428646.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32416604 ns 32475229 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40654875 ns 40408041.5 ns 1.01
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5840513 ns 5831517 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 146753459 ns 146000771 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 153140083.5 ns 154808750 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 135055542 ns 137043083.5 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 286267791 ns 285556542 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34875869 ns 34852076.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120929708.5 ns 121592083 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174008000 ns 174639125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147856792 ns 148027541 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 102357166.5 ns 105917833 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5458379 ns 5344344 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 472290792 ns 468650958 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 468203875 ns 466713000 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 437903521 ns 437158458 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 743156542 ns 744371959 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32279044 ns 35992005 ns 0.90
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 709215666.5 ns 712765167 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 641585354.5 ns 641204167 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 623424125.5 ns 624084979.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 853935458 ns 856208084 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1289084 ns 1270583 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 912625 ns 995709 ns 0.92
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 959625 ns 995875 ns 0.96
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2066167 ns 2037625 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 576350.5 ns 569478 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2954792 ns 2961229.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2624645.5 ns 2647792 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2616708 ns 2621500 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3750458 ns 3709750 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1708662 ns 1587708.5 ns 1.08
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5780625 ns 5785812.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5802646 ns 5824083 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5793708 ns 5785375 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2916792 ns 2904896 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7250 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6167 ns 6042 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 10042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24959.5 ns 24479.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212666.5 ns 223812.5 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219979.5 ns 222667 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220458 ns 220792 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 244353.5 ns 240666 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 249958 ns 212315.5 ns 1.18
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 296320791 ns 296229125 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 216911667 ns 216728584 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 196230687 ns 190254604.5 ns 1.03
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 303909375 ns 304954521 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7672082.5 ns 7671461.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1231911312.5 ns 1229817167 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 900530270.5 ns 902846291.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 828047958 ns 824304209 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1151206292 ns 1157856750.5 ns 0.99
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26738113 ns 26996841 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4833 ns 5292 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5500 ns 5291.5 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6167 ns 6375 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5000 ns 5250 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 149363.5 ns 112898 ns 1.32
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7041 ns 6875 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 6958 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541 ns 7583 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 7125 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 600699 ns 535221.5 ns 1.12
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 584 ns 0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 584 ns 584 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 23466 ns 23660 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8667 ns 8625 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8417 ns 9084 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9667 ns 9417 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9125 ns 8708 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 211340 ns 195936.5 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 368458 ns 352958.5 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351459 ns 352792 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352500 ns 351479 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352146 ns 356708.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21302 ns 20962 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 826271 ns 775625 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 824958.5 ns 825833 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 792000 ns 812229.5 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 830250.5 ns 834959 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 269586 ns 234827 ns 1.15
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 340937.5 ns 341562.5 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 343062.5 ns 341958 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 454770.5 ns 455917 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 14084 ns 11083 ns 1.27
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17990 ns 17699 ns 1.02
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 710583 ns 712500 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 728458 ns 739896 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1004208 ns 1007854 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 27417 ns 26459 ns 1.04
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 239886 ns 214680.5 ns 1.12
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 383166.5 ns 381042 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 350542 ns 346750 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 443208 ns 449187.5 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 31250 ns 39042 ns 0.80
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22514 ns 22537 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 718250 ns 733792 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 782083 ns 788958 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1028417 ns 1032500 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 105334 ns 105583 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 217107 ns 200835.5 ns 1.08
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3333 ns 3791 ns 0.88
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3708 ns 3541 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3625 ns 3708 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3417 ns 3708 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17516 ns 17542 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4104.5 ns 4250 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4208 ns 4167 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4291 ns 4250 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4166 ns 4250 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 232485 ns 204574.5 ns 1.14
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3333 ns 3834 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3667 ns 3667 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4084 ns 4250 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4250 ns 3625 ns 1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 176024.5 ns 160115.5 ns 1.10
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8291 ns 8292 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8250 ns 8166 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8458 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8333 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1051146 ns 989699 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204709 ns 203375 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210709 ns 212791 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210583 ns 210666 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199833.5 ns 200834 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34425 ns 34428 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 647229 ns 652624.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 649666.5 ns 622667 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 626208 ns 631604.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 640479.5 ns 632750 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 293508 ns 280400.5 ns 1.05
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 993750 ns 994229.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1020395.5 ns 1040292 ns 0.98
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 958396 ns 956020.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 887291 ns 853917 ns 1.04
batchedmm(128, Bsize=128)/forward/GPU/CUDA 206487.5 ns 208023.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4504792 ns 4502437.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4702583.5 ns 4668229.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4449000 ns 4455084 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 4321500 ns 4280937 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 979904 ns 935555 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3167 ns 3292 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3541 ns 3458 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4166 ns 4042 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3333.5 ns 3209 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 174711 ns 159049 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7042 ns 7291 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7042 ns 7333 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7375 ns 7334 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 6833 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 911927 ns 850635.5 ns 1.07
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1650250 ns 1640041 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1195333 ns 1196604.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1375625 ns 1383250 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2471000 ns 2417500 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213276 ns 215018 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12340062 ns 12333396 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9568500 ns 9592791.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9298896 ns 9267625 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18088041 ns 18011459 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1943838 ns 1959459 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17384833.5 ns 17332937.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14357854 ns 14386792 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14387313 ns 14369396.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21175104 ns 21112291.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 100083 ns 87708 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 87750 ns 88542 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 93416.5 ns 92833 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89625 ns 116000 ns 0.77
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125990 ns 126352.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2026687.5 ns 2022959 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2031083.5 ns 2049666 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2031250 ns 2035562.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2050458.5 ns 2025938 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 951363 ns 878938 ns 1.08
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 2979 ns 2750 ns 1.08
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2875 ns 3209 ns 0.90
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3520.5 ns 3417 ns 1.03
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2521 ns 2792 ns 0.90
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16207 ns 16283 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2666.5 ns 2542 ns 1.05
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2500 ns 2708 ns 0.92
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2875 ns 2875 ns 1
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2959 ns 2834 ns 1.04
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 179422.5 ns 176848 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7083 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 6000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6041 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 10042 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33838 ns 34134 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225292 ns 221583 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219750 ns 220000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220542 ns 220417 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 244708 ns 215333 ns 1.14
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 293649.5 ns 285763.5 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3709 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3709 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22219 ns 22875 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14417 ns 14500 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14375 ns 14375 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14625 ns 14458 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14583 ns 14500 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 436265 ns 410580 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 140000 ns 92125 ns 1.52
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 92458 ns 92916 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 96792 ns 96979 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 96792 ns 138000 ns 0.70
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125211.5 ns 125660 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921583.5 ns 1923792 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1923937.5 ns 1935291 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1928188 ns 1932916.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1942771 ns 1920500 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 855373 ns 861874.5 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 874041 ns 873916 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 820458 ns 826583 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1223417 ns 1222000 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 972500 ns 963750 ns 1.01
lenet(28, 28, 1, 32)/forward/GPU/CUDA 272168 ns 276546 ns 0.98
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2804167 ns 2791083 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2520875 ns 2445687.5 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3337667 ns 3347916 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3424895.5 ns 3371375 ns 1.02
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1501496.5 ns 1487194.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16791.5 ns 17250 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 14854.5 ns 17959 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18375 ns 17875 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15229 ns 17417 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 131230 ns 130892 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227959 ns 218625 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 250729 ns 260667 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216125 ns 227792 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 262791 ns 256083 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 582129.5 ns 584591.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 222062.5 ns 222000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 219125 ns 222667 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 222041.5 ns 222312.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221584 ns 220833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 244344.5 ns 243596.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 508270.5 ns 501417 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 521083 ns 496084 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 498833 ns 508541.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 565541.5 ns 561833 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1195773 ns 1202534 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 4479.5 ns 3895.5 ns 1.15
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 3583.5 ns 4270.5 ns 0.84
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 4750 ns 5708 ns 0.83
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 4625 ns 4458.5 ns 1.04
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16818 ns 16584 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7208 ns 7208.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7250 ns 7000 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7333 ns 7625 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7458.5 ns 7500 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 180977.5 ns 179332 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18583 ns 17687 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17583.5 ns 17917 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19958.5 ns 18625 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17333 ns 18729 ns 0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 132074.5 ns 135434 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212166 ns 211041 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212146 ns 220417 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212917 ns 212542 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218959 ns 212271 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 814362 ns 847267 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4042 ns 3959 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4208 ns 4209 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5000 ns 4875 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4000 ns 4291 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 175168.5 ns 187480.5 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10250 ns 10459 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9687.5 ns 10541.5 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11083 ns 10042 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10125 ns 10125 ns 1
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 961404 ns 955985 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3041.5 ns 3145.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3291 ns 2937.5 ns 1.12
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4375 ns 4000 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3416.5 ns 3167 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 193655 ns 188520.5 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7208.5 ns 7375 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7209 ns 7209 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7542 ns 7625 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7458 ns 7333 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 972220 ns 987324 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23356708 ns 23406938 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34480833.5 ns 35765125 ns 0.96
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37583875 ns 37705500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35001895.5 ns 34946604 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1828165 ns 1830206.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184126958 ns 183995333 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 166867125 ns 165575375 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146311896 ns 146468292 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 275288375 ns 274483625 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16524063 ns 16521685 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 276685520.5 ns 276817937 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 252606729 ns 246377395.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 231173396 ns 231576042 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 324261749.5 ns 325032833.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 184542 ns 182896.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182833 ns 184292 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185583 ns 184958 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 184895.5 ns 183167 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 166499.5 ns 200810.5 ns 0.83
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 634000 ns 635333 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 585209 ns 633354.5 ns 0.92
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 592708.5 ns 600291 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 630958 ns 597271 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 926373.5 ns 958799 ns 0.97
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3858042 ns 3842750 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3914708 ns 3997500 ns 0.98
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3549917 ns 3542792 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4595104.5 ns 4556625 ns 1.01
batchedmm(128, Bsize=512)/forward/GPU/CUDA 532803 ns 532425 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17337937.5 ns 17396104 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17877583 ns 18078958 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16422125 ns 16589917 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 20130416.5 ns 19981167 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2619405 ns 2633170 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32935 ns 32094 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8958 ns 8917 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8875 ns 8750 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 9041 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9209 ns 9042 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 248903 ns 249030 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 649671041.5 ns 652464437.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 390100166.5 ns 394034604 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 355146542 ns 326393417 ns 1.09
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 750210500 ns 748745833 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12471745.5 ns 12466975 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1883695042 ns 1885107791.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1646365041 ns 1638827875 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1513696187.5 ns 1512914354 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2208789146 ns 2208603583.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49495223 ns 49231175.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1642208 ns 1616792 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1192812.5 ns 1200917 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1386104 ns 1389625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2519667 ns 2477916.5 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215937.5 ns 215338 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12672750 ns 12691834 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9911875 ns 9979354.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9658417 ns 9689896 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18448708.5 ns 18371271 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1992558.5 ns 1985308 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17681874.5 ns 17676916 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14694333 ns 14722000 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14589750 ns 14613667 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21582250 ns 21413395.5 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26291 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26667 ns 26250 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26291 ns 26291 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23957 ns 23721 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66959 ns 67333 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67750 ns 67333 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67250 ns 67209 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67459 ns 67333 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 371563.5 ns 367128.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203875 ns 203542 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209500 ns 208625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209125 ns 209584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200459 ns 199792 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26219 ns 25494 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 647500 ns 604625 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 669416.5 ns 670666.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 685542 ns 632166.5 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632166.5 ns 630000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 324278 ns 321975.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 675000 ns 639021 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 541042 ns 643458 ns 0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 637375 ns 658750 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 666542 ns 632750 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132249.5 ns 131332 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2232250 ns 2244229 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2239333.5 ns 2277708.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2241084 ns 2240167 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2299271.5 ns 2235458.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1091764 ns 1075922 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17833 ns 17167 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17917 ns 17916 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20584 ns 18167 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18709 ns 18208 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 133803 ns 130720.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 260333 ns 258584 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 255395.5 ns 227459 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 253687.5 ns 232750 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 230479 ns 230791 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 901721 ns 887768.5 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 541 ns 625 ns 0.87
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 667 ns 625 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 666 ns 666 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23720 ns 23104 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8333.5 ns 9750 ns 0.85
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9666 ns 9250 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10208 ns 9208 ns 1.11
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9750 ns 9417 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 244421 ns 242418 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5125 ns 5208 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 5125 ns 1.12
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6584 ns 6375 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5125 ns 5375 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 195651 ns 193804 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7083 ns 7167 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7250 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7750 ns 7375 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7875 ns 7042 ns 1.12
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 711373.5 ns 706410 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2041 ns 2125 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2250 ns 2250 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2250 ns 2209 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2208 ns 2208 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 18128 ns 17672 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6542 ns 6458 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6542 ns 6291 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6625 ns 6709 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6417 ns 6500 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 296966 ns 300575 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 751937.5 ns 749459 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746542 ns 748959 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 750125 ns 750854 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 751833.5 ns 749167 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21365 ns 20805 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 811458 ns 775208 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 810958 ns 795916.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 790958 ns 792791 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 813167 ns 792792 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 271261 ns 274546.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7208 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 5917 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5917 ns 5959 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10250 ns 10250 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33874 ns 33244 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 258396 ns 219625 ns 1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 269104 ns 240291 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 253416 ns 237583 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 245208 ns 260042 ns 0.94
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 333723 ns 337443 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10250 ns 10084 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10334 ns 9583 ns 1.08
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10625 ns 10750 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10250 ns 10167 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 213790.5 ns 223296.5 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24583 ns 25125 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24500 ns 24312.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24792 ns 24917 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24916 ns 24667 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1032950.5 ns 1047460.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 107140583 ns 106018062.5 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 117792062 ns 118144520.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120863042 ns 120409292 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117603375 ns 117468833 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2946778 ns 2652084 ns 1.11
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393794791.5 ns 373672500 ns 1.05
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 359678396 ns 359102771.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 357838334 ns 356068521.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 545418083.5 ns 543525042 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15489580 ns 15230726 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 607837250 ns 605345333 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 579716416 ns 584604208 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 747642396 ns 744606604.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 607166334 ns 793208583.5 ns 0.77
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7292 ns 6500 ns 1.12
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6958 ns 6375 ns 1.09
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7625 ns 8062 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6834 ns 7146 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 206235.5 ns 216878 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13709 ns 13625 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14167 ns 13625 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14500 ns 14125 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14292 ns 14084 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 968613 ns 1010131 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 5625 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6250 ns 6000 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6875 ns 7895.5 ns 0.87
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 5958 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 204166 ns 211472.5 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12625 ns 12583 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12583 ns 12333 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13000 ns 12708 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12292 ns 12709 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 694587 ns 725788 ns 0.96
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5917 ns 5583 ns 1.06
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5458 ns 5875 ns 0.93
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 5875 ns 6583.5 ns 0.89
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5958 ns 6167 ns 0.97
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16951 ns 17002 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15583 ns 15916 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15375 ns 15250 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15625 ns 16125 ns 0.97
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15708 ns 15834 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 185517 ns 187784.5 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 334 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 22862.5 ns 23531 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6209 ns 6167 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6208 ns 6292 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6542 ns 6459 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6084 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 223995 ns 228744 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5834 ns 5834 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5916 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6000 ns 5959 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5959 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 23989 ns 24273 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 20833 ns 20833 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 20583 ns 20750 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21625 ns 21292 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21375 ns 21041 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 246983.5 ns 251207.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 169125 ns 185375 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144292 ns 144625 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 148291.5 ns 147917 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 189062.5 ns 144417 ns 1.31
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166865 ns 166909.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1326271 ns 1321833 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1323042 ns 1350479 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1320500 ns 1337166 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1341500 ns 1323625 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1189366 ns 1251196 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23000 ns 24833 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23479 ns 25041 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24875 ns 23958 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24750 ns 24271 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 254630.5 ns 315591 ns 0.81
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 130167 ns 131292 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 128375 ns 118396 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 123229 ns 176916 ns 0.70
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 131062.5 ns 129458 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1279498 ns 1353120 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23209 ns 23127 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6042 ns 6125 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6416 ns 6459 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6792 ns 6333 ns 1.07
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6458 ns 6125 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 238830 ns 245064.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4333 ns 4208 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4542 ns 4875 ns 0.93
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4708 ns 5125 ns 0.92
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4791 ns 4667 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 217579.5 ns 228957.5 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9666 ns 9875 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10042 ns 9875 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10334 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10208 ns 10208 ns 1
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1231902.5 ns 1285818.5 ns 0.96
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1584 ns 1584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22989 ns 23344 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5667 ns 5750 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5625 ns 5709 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6041 ns 6000 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5750 ns 5666 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 258706.5 ns 264086.5 ns 0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6877625 ns 6807541.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6431167 ns 6433375 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6497166 ns 6489875 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7600437.5 ns 7649521 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213793 ns 214938 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24074875 ns 24073959 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21241875 ns 21296000 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21023583.5 ns 21044062.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29822125.5 ns 29805771 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2088714.5 ns 2104181 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37413209 ns 37247625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 34256250 ns 34089791 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45704562.5 ns 45725979.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38148271 ns 49397750 ns 0.77
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5416 ns 5500 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6104.5 ns 5708 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6667 ns 6541 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6167 ns 5708 ns 1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 206549 ns 208256 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 8084 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8229.5 ns 8125 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8584 ns 8375 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8375 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 962776 ns 991485 ns 0.97
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1560583 ns 1509000 ns 1.03
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1259145.5 ns 1282542 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1626291.5 ns 1634916.5 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2161625 ns 2162000.5 ns 1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA 280818.5 ns 271116.5 ns 1.04
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7902229 ns 7902209 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6567125 ns 6449312.5 ns 1.02
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7147750 ns 7195708 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10485771 ns 10462229 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1771472.5 ns 1752716.5 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 373687.5 ns 371187.5 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 370583 ns 374208 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 462021 ns 461250 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 23584 ns 22208 ns 1.06
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45539 ns 42428.5 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 728750 ns 745437.5 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 804208.5 ns 815833 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1065312.5 ns 1062958 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 96666.5 ns 117396 ns 0.82
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 226465 ns 283256.5 ns 0.80
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397333 ns 397208 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288042 ns 288667 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288417 ns 287875 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 751375 ns 750917 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44356 ns 43636 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 672167 ns 667000 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531292 ns 531375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 528292 ns 531417 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 975666 ns 974083 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 193617.5 ns 188745 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 669291 ns 644833 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 642666 ns 648750 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 644708.5 ns 644479 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 687208 ns 652458.5 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132960 ns 131347.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2454209 ns 2445334 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2456687 ns 2500021 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2455291 ns 2463250 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2470521 ns 2463375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1122477 ns 1238313 ns 0.91
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 3541 ns 3417 ns 1.04
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3208 ns 3625 ns 0.88
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4458 ns 4250 ns 1.05
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 2958 ns 3437.5 ns 0.86
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16816 ns 16066 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5292 ns 5375 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5333 ns 5292 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5625 ns 5750 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5750 ns 5583 ns 1.03
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 187435 ns 182995 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458000 ns 1458042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1498250 ns 1499750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1497083 ns 1503250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1439583 ns 1437708 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40900 ns 40191 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5127041 ns 5113291 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5298083.5 ns 5287958 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5287583 ns 5307041.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5015875 ns 4985125 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198989 ns 196599 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3709 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3709 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34297 ns 33557 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15125 ns 15125 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15083.5 ns 15167 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15375 ns 15416 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15166 ns 15208 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 348507 ns 349206 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71250 ns 71125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71333 ns 71542 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 70959 ns 71209 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71209 ns 71041 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113569.5 ns 113114 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 317792 ns 317667 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 319125 ns 324125 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 319500 ns 318292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 319875 ns 317625 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 197937.5 ns 193277 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 959 ns 958 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1041 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1084 ns 1083 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 1125 ns 0.89
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23702 ns 23048 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7750 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 8270.5 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8334 ns 8250 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7958 ns 8041 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 249887 ns 245757.5 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 504875 ns 502770.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 484208 ns 484500 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 564708 ns 561750 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 236458 ns 219917 ns 1.08
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130159 ns 129178 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1379479.5 ns 1387645.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1446458.5 ns 1473958 ns 0.98
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1730646 ns 1779041.5 ns 0.97
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 884667 ns 862917 ns 1.03
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 273315.5 ns 273950 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 333 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 334 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 416 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 334 ns 333 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32089 ns 31657.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6083 ns 6125 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6000 ns 6208 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6500 ns 6541 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6083 ns 6042 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 250296.5 ns 251419 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1723562.5 ns 1733792 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1725958.5 ns 1721208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1731208 ns 1724250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1767667 ns 1773541 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168954.5 ns 168671 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4352187.5 ns 4114542 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4302209 ns 4392834 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4360250 ns 4368208.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4366750 ns 4369208.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1065222 ns 1291475.5 ns 0.82
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6916 ns 6834 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6750 ns 6667 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 6875 ns 7999.5 ns 0.86
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6958 ns 7041 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20747 ns 20138.5 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 67792 ns 51250 ns 1.32
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 48292 ns 32625 ns 1.48
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 32958 ns 73833 ns 0.45
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51583 ns 51084 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 198224 ns 340107 ns 0.58
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 18375 ns 17833 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 17625 ns 18083 ns 0.97
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18542 ns 18875 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 18291 ns 18208 ns 1.00
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18190 ns 18400 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53292 ns 53250 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53541 ns 53041 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53500 ns 53375 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53500 ns 53542 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 306993 ns 319083.5 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75375 ns 75166 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75208 ns 75625 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75000 ns 75291.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75458 ns 75083 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46432 ns 47469 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 323792 ns 324958 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 324916 ns 342000 ns 0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 325000 ns 325000 ns 1
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 327375 ns 324542 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 209114 ns 211595 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1485167 ns 1484959 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1524792 ns 1526854.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1525000 ns 1527250 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1466042 ns 1462542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51777 ns 51799 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5115209 ns 5111083.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5290000 ns 5312417 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5261979.5 ns 5299333.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5012167 ns 4982354 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 202581 ns 204934 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28208 ns 28208 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28208 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28208 ns 28187.5 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24112 ns 24742 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66333 ns 66500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66375 ns 66709 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66667 ns 66500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67041 ns 66541 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 467729 ns 484630.5 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1491583.5 ns 1480583.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1128834 ns 1136563 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1128084 ns 1136750 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2260833.5 ns 2265937.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 577757.5 ns 579622.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3056208 ns 3074562.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2732395.5 ns 2788145.5 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2734709 ns 2743021 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3843875 ns 3819500.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1892225.5 ns 1931643 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7896000 ns 7902458 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7928041.5 ns 7834062.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7897562.5 ns 7920375 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4840958 ns 4826312.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81709 ns 77625 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81062.5 ns 81167 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85084 ns 84041.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 90541 ns 111396 ns 0.81
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194858.5 ns 193746 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2012792 ns 2012875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2022916.5 ns 2046292 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2012625 ns 2031354 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2042500 ns 2015417 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 690147 ns 746361.5 ns 0.92

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.