Skip to content

Commit

Permalink
fix: use generic broadcasting for complex numbers (#1106)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 25, 2024
1 parent 6f9f8d6 commit 161b64c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 3 deletions.
2 changes: 1 addition & 1 deletion lib/LuxLib/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.9"
version = "1.3.10"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
5 changes: 4 additions & 1 deletion lib/LuxLib/src/impl/bias_activation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,17 @@ function CRC.rrule(cfg::RuleConfig{>:HasReverseMode}, ::typeof(bias_activation),
return y, ∇bias_activation_rrule
end

y, ∇broadcast = CRC.rrule_via_ad(cfg, broadcast, σ ∘ +, x, reshape_bias(x, bias))
y, ∇broadcast = CRC.rrule_via_ad(
cfg, broadcast_bias_activation_generic, σ, x, reshape_bias(x, bias))
∇bias_activation_rrule = @closure Δ -> begin
_, _, ∂x, ∂bias = ∇broadcast(Δ)
return ∂∅, ∂∅, ∂∅, 𝒫x(∂x), 𝒫bias(vec(∂bias))
end
return y, ∇bias_activation_rrule
end

@inline broadcast_bias_activation_generic(σ::F, x, b) where {F} = σ.(x .+ b)

bias_activation!!(::typeof(identity), x::AbstractVector, ::Nothing) = x
for bType in (Nothing, AbstractVector)
@eval function bias_activation!!(σ::F, x::AbstractVector, bias::$(bType)) where {F}
Expand Down
7 changes: 6 additions & 1 deletion lib/LuxLib/src/traits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ is_mutable_array(::Nothing) = True()

ChainRulesCore.@non_differentiable is_mutable_array(::Any...)

for op in (:has_dual, :has_float16, :is_tracked)
for op in (:has_dual, :has_float16, :is_tracked, :has_complex)
@eval $op(::Nothing) = False()
@eval $op(x::Numeric) = $op(eltype(x))
end
Expand All @@ -38,6 +38,9 @@ has_dual(::Type{<:ForwardDiff.Dual}) = True()
has_float16(_) = False()
has_float16(::Type{<:Float16}) = True()

has_complex(_) = False()
has_complex(::Type{<:Complex}) = True()

is_tracked(_) = False()

has_autodiff_value(x) = is_tracked(x) | has_dual(x)
Expand All @@ -51,6 +54,7 @@ function use_generic_broadcasting(xs::Tuple)
xs_unwrapped = unrolled_map(unwrap_array, xs)
return unrolled_any(has_autodiff_value, xs_unwrapped) |
unrolled_any(has_float16, xs_unwrapped) |
unrolled_any(has_complex, xs_unwrapped) |
unrolled_any(static_isa(StaticArray), xs_unwrapped)
end

Expand Down Expand Up @@ -198,6 +202,7 @@ Currently supported modes are:
+ ReverseDiff Arrays
+ Tracker Arrays
+ ForwardDiff.Dual Arrays
+ Complex Arrays
- `GPUBroadcastOp{dev}`: GPU Arrays where `dev` is obtained from `get_device_type(xs)`.
This option dispatches should preferably use `KernelAbstractions` or specialized vendor
Expand Down
27 changes: 27 additions & 0 deletions test/issue_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
@testitem "complex differentiation: issue #977" tags=[:misc] begin
using Lux, Zygote, Random

rng = Random.default_rng()
Random.seed!(rng, 666)

rbf(x) = exp.(-(x .^ 2))

U = Lux.Chain(
Lux.Dense(1, 10, rbf),
Lux.Dense(10, 3, rbf)
)

θ, st = Lux.setup(rng, U)

function complex_step_differentiation(f::Function, x::Float64, ϵ::Float64)
return imag(f(x + ϵ * im)) / ϵ
end

loss(t) = sum(complex_step_differentiation(τ -> U([τ], θ, st)[begin], t, 1e-5))

if pkgversion(LuxLib) ≥ v"1.3.10"
@test only(Zygote.gradient(loss, 1.0)) isa Float64
else
@test_broken only(Zygote.gradient(loss, 1.0)) isa Float64
end
end

5 comments on commit 161b64c

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/LuxLib

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/120168

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a LuxLib-v1.3.10 -m "<description of version>" 161b64c0d0edd0ee475fc9e6ca59fdb71681e8d9
git push origin LuxLib-v1.3.10

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/120169

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.4.0 -m "<description of version>" 161b64c0d0edd0ee475fc9e6ca59fdb71681e8d9
git push origin v1.4.0

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 161b64c Previous: 6f9f8d6 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4042 ns 3937.5 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4042 ns 4333 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5000 ns 4917 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3917 ns 4042 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60335 ns 61383 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10292 ns 10583 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9958 ns 10250 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10917 ns 10125 ns 1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9917 ns 10083 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 425045 ns 431239 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1250 ns 1042 ns 1.20
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1125 ns 1334 ns 0.84
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1333 ns 1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1083 ns 1125 ns 0.96
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 17905 ns 18191 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4083 ns 4333 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4000 ns 4250 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4375 ns 4291 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3916 ns 3834 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 109347 ns 110865.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56292 ns 57709 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46833 ns 46667 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46229.5 ns 46208 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81458 ns 80291 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36705 ns 37897 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2055229.5 ns 2036958 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2092146 ns 2083333.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2088791.5 ns 1856125 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2005459 ns 1994375 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195507 ns 198201 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 175854 ns 157792 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144666 ns 145875 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145708 ns 145583.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 141167 ns 143729 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165651 ns 166222 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1150750 ns 1114145.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1127354.5 ns 1128875 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1114250 ns 1024292 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1116458.5 ns 1115833.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 529529 ns 534915.5 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3208 ns 3584 ns 0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3417 ns 4208 ns 0.81
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4208 ns 4000 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3334 ns 3583 ns 0.93
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 70388 ns 67978 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 9750 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9500 ns 10459 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9750 ns 8625 ns 1.13
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9250 ns 9125 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 494790 ns 495677 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15209 ns 15000 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15000 ns 18500 ns 0.81
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17209 ns 16000 ns 1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14688 ns 14583 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 54580 ns 55105 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216291.5 ns 213834 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 214167 ns 215958 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213416 ns 213333 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225708.5 ns 214375 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 274273 ns 276152.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 709 ns 541 ns 1.31
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 792 ns 0.79
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 834 ns 792 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 541 ns 1.16
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17190 ns 17241 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1709 ns 1541 ns 1.11
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1375 ns 1708 ns 0.81
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1791 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1584 ns 1542 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 102235 ns 102070.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7208 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 5958 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5916 ns 5916 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9958 ns 10042 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23722 ns 23944 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222833 ns 221312.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227958 ns 229500 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229500 ns 228667 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213417 ns 218021 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 169452 ns 170367 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4000 ns 3958 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3916 ns 3917 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23542 ns 23420 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16834 ns 17000 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16709 ns 17084 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16959 ns 16916 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16666 ns 16708 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 162915 ns 162884.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 571709 ns 573416.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 574917 ns 580333 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 573708 ns 568042 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 568500 ns 569542 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113185.5 ns 113416 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1427354.5 ns 1418250 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1431625 ns 1429042 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1423541 ns 1420375 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1422542 ns 1437458 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 211963 ns 212927.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1046896 ns 1086895.5 ns 0.96
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 967000 ns 962854 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1344687.5 ns 1344792 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1304958 ns 1286083 ns 1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA 275060 ns 281106 ns 0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5993167 ns 5908292 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4544458 ns 4600625 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4946959 ns 4927041.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5568042 ns 5714562.5 ns 0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1091420 ns 1101975 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23913 ns 23476 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2209 ns 2209 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2167 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2166 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 169337.5 ns 169515.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4417 ns 4333 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 3833 ns 4500 ns 0.85
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4625 ns 4791.5 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3958.5 ns 3791 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 65443 ns 65149 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11833 ns 11584 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11250 ns 11333 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11958 ns 11667 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11125 ns 11333 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 450871 ns 446339 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7208 ns 6916 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6958 ns 6917 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7417 ns 7479.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6333 ns 6042 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 51992 ns 51979.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18459 ns 19125 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17500 ns 17458 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17833 ns 17792 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17459 ns 17333 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 300918 ns 300938.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 667 ns 666 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32212 ns 32053.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9084 ns 9250 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9437 ns 9042 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9333 ns 9375 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8958 ns 8917 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 158990.5 ns 158152 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64208 ns 64333 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64833 ns 64625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64542 ns 64458 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64542 ns 64375 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111823 ns 111585 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 282708 ns 287209 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 279000 ns 277834 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 273166 ns 280583 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 281437.5 ns 281125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 186218.5 ns 183928 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3136750 ns 3298562.5 ns 0.95
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3023208 ns 3083000 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3030188 ns 3028771 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3954583.5 ns 4061625 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 576992 ns 577723.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7597041.5 ns 7606291 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7419792 ns 7495375.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7452395.5 ns 7404541 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8186583.5 ns 8192541.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1367306 ns 1371476 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17658333 ns 17505792 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17553062.5 ns 17567291 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17551250 ns 17475667 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14310208 ns 14122958.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23729167 ns 23660020.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33388291 ns 34147791.5 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37228104.5 ns 37059937.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34843354 ns 34985187.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1868338 ns 1854503 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 192271333 ns 187449375 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 232983250 ns 233703458.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 191886562.5 ns 195671083 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 435397084 ns 433586291 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13905970 ns 13830860.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 291433625 ns 288446709 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 336814583 ns 337867791 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 297436208 ns 296978708 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 408923438 ns 400413062.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22583 ns 22084 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24708 ns 24979.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23209 ns 23875 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21625 ns 21666 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 99141.5 ns 98077 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103334 ns 102958 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103750 ns 104792 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 105083 ns 103812 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103062.5 ns 110292 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 520213.5 ns 512479 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6000 ns 6125 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5958 ns 6500 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6958 ns 7062.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5708 ns 6083 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 69364 ns 69253 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15042 ns 15166 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15209 ns 16145.5 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16250 ns 16208 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15083 ns 15083 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 484888 ns 482969 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3057208.5 ns 3041271 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2066208 ns 2067458.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2260437.5 ns 2297479.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4508458 ns 4457375 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 589772 ns 592674 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23926959 ns 23527562.5 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18026875 ns 18050396 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18022708 ns 17902834 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35506041.5 ns 35496125 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2765084 ns 2768935.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33917958 ns 33385459 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27599646 ns 27540666 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28534208 ns 28658250 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41643583.5 ns 41547354.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74541.5 ns 74875 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74313 ns 74396 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74500 ns 75500 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72291 ns 74333 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104269 ns 102653 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 317750 ns 291291.5 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 208562.5 ns 318417 ns 0.65
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 322375 ns 208187.5 ns 1.55
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 291583.5 ns 290437.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 562266.5 ns 545207.5 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11875 ns 11958 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11625 ns 12145.5 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13250 ns 14209 ns 0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12125 ns 11500 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 72944 ns 70994 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27208 ns 27042 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26791.5 ns 26917 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27833.5 ns 27625 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26750 ns 26958 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 485353 ns 469447.5 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13458.5 ns 12708 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12375 ns 12708 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13250 ns 14125 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12291 ns 11958 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 54559.5 ns 52810 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26417 ns 25958 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25959 ns 26209 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26209 ns 25958 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26000 ns 26875 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 311166.5 ns 301312 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181458 ns 179875 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 179708 ns 181083 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 183437.5 ns 182500 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 181354 ns 179666 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58673.5 ns 56497 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 597521 ns 582959 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 584083 ns 588917 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 583958.5 ns 585083 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582625 ns 590500 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 295518 ns 286103 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6125 ns 6417 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5958 ns 7125 ns 0.84
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7333 ns 8083 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6166.5 ns 5750 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 71636.5 ns 70488.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15312.5 ns 14875 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14333 ns 14458 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15708 ns 15417 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13958 ns 14291 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 473061 ns 457568 ns 1.03
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1205708 ns 1207438 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1241125 ns 1241417 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1286479 ns 1284208 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1000208 ns 997354.5 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301351 ns 301394.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4319770.5 ns 4107041.5 ns 1.05
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4471334 ns 4414458 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4578416 ns 4959854.5 ns 0.92
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3698417 ns 3696125 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1037486.5 ns 1040815 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1916 ns 1834 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1792 ns 1834 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1917 ns 1833 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 24166 ns 23635 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4917 ns 4959 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4834 ns 5041 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5083 ns 4958 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4958 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 194650 ns 185922 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6583 ns 6250 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6208 ns 6625 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7125 ns 6334 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 5666 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 56615.5 ns 55102 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 12209 ns 11084 ns 1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10895.5 ns 11834 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11667 ns 10791 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11000 ns 10875 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 336343 ns 330730 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 375 ns 334 ns 1.12
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 250 ns 333 ns 0.75
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 375 ns 292 ns 1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23536 ns 22810 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3042 ns 3000 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2791 ns 3000 ns 0.93
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 2959 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2750 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 163558.5 ns 156803.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12083 ns 11750 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11375 ns 11958 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14667 ns 13292 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11500 ns 11292 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 58066.5 ns 57953 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25541 ns 25291.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24250 ns 24917 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25125 ns 25125 ns 1
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24458 ns 24542 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 299332 ns 293802.5 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4167 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4209 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4209 ns 4208 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25749 ns 24619 ns 1.05
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16125 ns 16375 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16166 ns 16292 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16291 ns 16333 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16250 ns 16250 ns 1
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 200089.5 ns 195053 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5916 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5750 ns 5834 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5959 ns 5834 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34238 ns 33320 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21125 ns 21209 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20459 ns 21125 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21167 ns 21625 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20812.5 ns 20667 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 179917 ns 173685 ns 1.04
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 397270.5 ns 426708 ns 0.93
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 384187.5 ns 384958 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 478583.5 ns 482062.5 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 103333 ns 102708.5 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67557 ns 66966 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 891750 ns 909146 ns 0.98
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 972959 ns 972729 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1184041.5 ns 1175729 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 330499.5 ns 439917 ns 0.75
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 194177 ns 190337.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 79812.5 ns 80625 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81209 ns 81500 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84042 ns 81708 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 79916.5 ns 80187.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194547.5 ns 193436 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1931812.5 ns 1902458 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1636646 ns 1931125 ns 0.85
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1918646 ns 1927562.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1926062 ns 1906917 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 403673 ns 397725 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22738 ns 22050 ns 1.03
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1834 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 173787 ns 169534 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7041 ns 6875 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6666 ns 7146 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7666 ns 7667 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6666 ns 6500 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61338.5 ns 62608.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9459 ns 9542 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9208 ns 9333 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9333 ns 9083 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 9458 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 310208.5 ns 313766.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 155906937.5 ns 118190208 ns 1.32
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174332958 ns 174175750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147872625 ns 147818500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 105277000 ns 107522750 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5483548 ns 5476530 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 669282000 ns 612187917 ns 1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555382333 ns 556303083 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 453291791.5 ns 452274750 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 761771979 ns 757288396 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 35124637 ns 38234410 ns 0.92
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 699486584 ns 649126292 ns 1.08
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 668241854.5 ns 667267229 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 612942458.5 ns 589618437.5 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 744149959 ns 741758417 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56292 ns 57583 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47709 ns 47375 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47584 ns 47833 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83167 ns 82250 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37949 ns 37784 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1925646.5 ns 1917978.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1981729 ns 1995291.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1969458.5 ns 1985646 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1898709 ns 1843354 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 177394.5 ns 172983 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 269708.5 ns 266084 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 268625 ns 268750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 287000 ns 268209 ns 1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 267041 ns 267562 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 125253 ns 132212 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 681916.5 ns 650416.5 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 693791 ns 674667 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 682500 ns 589437.5 ns 1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 685958 ns 688771 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 675851.5 ns 730804 ns 0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2214458 ns 2181417 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2234583 ns 2196416.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2206291 ns 2101104 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2191792 ns 2231125 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134149 ns 133510.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5560291 ns 5502917 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5498375 ns 5510333 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5509646 ns 5498521 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5459417 ns 5441417 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 719852 ns 776428 ns 0.93
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 658625 ns 640333 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 643125 ns 646083 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 639042 ns 646875 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 637666 ns 635334 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47328 ns 47144 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1793125 ns 1818833 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1725000 ns 1727958 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1723687.5 ns 1724083 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2098895.5 ns 2099750 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 225375 ns 220116 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56875 ns 58500 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47416 ns 47083 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47125 ns 46458 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83625 ns 81500 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29103 ns 28922 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036833 ns 2025729 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2094667 ns 2106791.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2075625 ns 2095000 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2003333 ns 1998375 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 192100 ns 189080 ns 1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13402000 ns 13351000 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12431542 ns 12437395.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12506125 ns 12498666.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 14837542 ns 14894375 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 516101 ns 519065 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47711000 ns 47200625 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 42011395.5 ns 41881708 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 40917708 ns 40754334 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58129729.5 ns 58105083 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2890593.5 ns 2883161 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 97106625 ns 96219125 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 68523125 ns 91954062.5 ns 0.75
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90562125 ns 90758584 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 76819625 ns 98984500 ns 0.78
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57208 ns 58708 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47750 ns 47000 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47250 ns 47291 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82041 ns 82000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47330 ns 47821 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1935667 ns 1902250 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1983791 ns 1986000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1973041.5 ns 1978125 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1878417 ns 1883042 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195219 ns 194258.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 417 ns 417 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 334 ns 333 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32542 ns 32804.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6750 ns 6792 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6125 ns 6625 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6625 ns 6708 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6166 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 170591.5 ns 179592 ns 0.95
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 334 ns 292 ns 1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32528 ns 32076 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2958 ns 2917 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2667 ns 2875 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2917 ns 2875 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 158771.5 ns 166744 ns 0.95
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 321043354.5 ns 284237292 ns 1.13
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340532834 ns 339653916.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 314151312.5 ns 313913791.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 270601541 ns 272402875 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7107105.5 ns 7047786.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1046677708.5 ns 993594875 ns 1.05
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 945289167 ns 945283292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 840954313 ns 835507124.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1155312792 ns 1160037292 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34104665 ns 34045459 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1718615541 ns 1668906166 ns 1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1335253333.5 ns 1694695167 ns 0.79
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1620256500 ns 1627000917 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1333409458.5 ns 1703328625 ns 0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1460479.5 ns 1418646 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1422584 ns 1413958 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1418083.5 ns 1414875 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1412208.5 ns 1411416 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127814.5 ns 128242 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5051916 ns 5029312.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5033458.5 ns 5037875 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5025999.5 ns 5028146 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5025125 ns 5024417 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 500081 ns 552451.5 ns 0.91
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 162840083 ns 170453833 ns 0.96
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 128019708.5 ns 127944542 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 130269666 ns 129428958 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 152687687.5 ns 164372666.5 ns 0.93
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4884899 ns 4859943 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 844540708 ns 620949625 ns 1.36
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 537349833 ns 515114583 ns 1.04
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 560583292 ns 463124083 ns 1.21
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 649437458 ns 648066667 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 17863022 ns 16797902 ns 1.06
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 9095833.5 ns 8927250 ns 1.02
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8979250 ns 8950584 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7868500 ns 7917333 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9713958 ns 9753125 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1593097 ns 1591258 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 37599479 ns 35919479 ns 1.05
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37114520.5 ns 37210542 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33537625 ns 33517916.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 37598895.5 ns 37573417 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6454775 ns 6470424 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47417 ns 47417 ns 1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47500 ns 47583 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47666 ns 47708 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47375 ns 47417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18487 ns 18601 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50417 ns 50542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50416 ns 50458 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50500 ns 50542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50333.5 ns 52916.5 ns 0.95
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 161534 ns 206886.5 ns 0.78
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7854.5 ns 7000 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6770.5 ns 7291 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7729.5 ns 7625 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7083 ns 7125 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 73765 ns 89400.5 ns 0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10209 ns 10625 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10375 ns 10333.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10042 ns 10417 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10000 ns 10479.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 437389 ns 543240.5 ns 0.81
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6875 ns 6000 ns 1.15
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6458 ns 6166 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 7083 ns 1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5583.5 ns 5666.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 81756 ns 121379.5 ns 0.67
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13791 ns 13333 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13500 ns 13000 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13541 ns 13333 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12895.5 ns 12687.5 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 408231 ns 510219 ns 0.80
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1125 ns 1125 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1084 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1084 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32689 ns 32431.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8375 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7667 ns 8542 ns 0.90
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8000 ns 8292 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8042 ns 8042 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 192936.5 ns 204899 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23459 ns 23208 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23208 ns 23417 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23375 ns 23666 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23542 ns 23334 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18259 ns 18285 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52750 ns 52625 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52625 ns 52709 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52791.5 ns 53083 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52292 ns 52562.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 228166 ns 286000 ns 0.80
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1407187.5 ns 1398458 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1444833 ns 1450667 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1405083 ns 1398999.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1396895.5 ns 1395750.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196465 ns 196905 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5040708 ns 5011896 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5018541 ns 5032187.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5002417 ns 5012250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5013625 ns 5002687.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 546168 ns 598226 ns 0.91
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3079083 ns 3070875 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2047000 ns 2072042 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2294458.5 ns 2289104.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4540917 ns 4773854 ns 0.95
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 582581 ns 584355 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24731020.5 ns 24311583 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18912562.5 ns 18870583.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19038249.5 ns 19070166.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36828979.5 ns 36514562.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2836262.5 ns 2861612.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34546958.5 ns 34008958 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28342834 ns 28397792 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28021500.5 ns 27946625 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41446459 ns 41793708.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144151542 ns 144075292 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 148019541 ns 147842750 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 125949729 ns 126624187.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173005021 ns 172290146 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22565027 ns 22560426 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 948587416.5 ns 1298569062.5 ns 0.73
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1316893208.5 ns 886633209 ns 1.49
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 846166625 ns 1199135125 ns 0.71
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 681952500 ns 689233333 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118678990 ns 117701235 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76499.5 ns 73000 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 80646 ns 73292 ns 1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75541.5 ns 85645.5 ns 0.88
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72583 ns 72583 ns 1
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 219501.5 ns 223969 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 295875 ns 276062.5 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 203584 ns 287625 ns 0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 292875 ns 282625 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 288125 ns 190583 ns 1.51
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1030687 ns 1155754 ns 0.89
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 36242145.5 ns 35424583 ns 1.02
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36566979.5 ns 36355854 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32367458.5 ns 32516083.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40164416.5 ns 40329917 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5846818 ns 5847057 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 152632458 ns 144746000 ns 1.05
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 152676896 ns 153804708.5 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 139286062.5 ns 140298187 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 283773000 ns 283107125 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34916870 ns 34865240 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 156722375.5 ns 121095354 ns 1.29
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173916792 ns 174763417 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148066500 ns 148056208 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 102175416 ns 105211667 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5486669 ns 5466322 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 519305021 ns 468110062.5 ns 1.11
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 467283583 ns 466487917 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 441689083 ns 437682625 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 742430042 ns 737562458 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32276395 ns 35152775 ns 0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 688401084 ns 706128833.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 657912104.5 ns 656179312 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 573100917 ns 571296688 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 731550292 ns 731578125 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1195458.5 ns 1324833 ns 0.90
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 988250 ns 963417 ns 1.03
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 987583 ns 979125 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2066875 ns 2064125 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 585359 ns 573443.5 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2919770.5 ns 2963875 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2614875 ns 2641084 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2611792 ns 2621249.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3691417 ns 3522250 ns 1.05
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1640515 ns 1659147 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5907500 ns 5792625 ns 1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5785541 ns 5824583.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5799666 ns 5815083.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2887792 ns 2879416 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7292 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 6333 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6167 ns 6250 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 9917 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25666.5 ns 25248 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213834 ns 212708 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221083 ns 220666 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220958 ns 221208 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 209500 ns 206375 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 224627 ns 250623 ns 0.90
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 310292438 ns 307616584 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 228430666 ns 221441583 ns 1.03
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 199615625 ns 198752396 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 310121208 ns 309471333 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7680035.5 ns 7903869 ns 0.97
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1101205687.5 ns 1075422250 ns 1.02
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 904614354 ns 906727646 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 806439375 ns 801892167 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1160007708.5 ns 1153514499.5 ns 1.01
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26999631 ns 26746953 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6458 ns 5791.5 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6041 ns 5917 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6083 ns 6375 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4895.5 ns 4875 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 119636.5 ns 155781 ns 0.77
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7833 ns 7625 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7292 ns 7334 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7500 ns 7562.5 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7166 ns 7083 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 510149.5 ns 649264 ns 0.79
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 666 ns 542 ns 1.23
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 708 ns 666 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24235 ns 23898 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9625 ns 9333.5 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9583 ns 9542 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9291 ns 9833 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9041 ns 8792 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 191615 ns 220286 ns 0.87
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352542 ns 351479.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351541.5 ns 352042 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 354208 ns 353812.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352041 ns 354687.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21082 ns 21024 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 827625 ns 811959 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 774417 ns 778625 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 830187 ns 774625 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 822209 ns 821708 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 224458.5 ns 304830.5 ns 0.74
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 315667 ns 339000 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 337708 ns 343083 ns 0.98
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 448542 ns 451041.5 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 11375 ns 10583 ns 1.07
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18423 ns 18316 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 705604.5 ns 714000 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 738958.5 ns 742750.5 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 999000 ns 1003583 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 26459 ns 26375 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 211965.5 ns 291054.5 ns 0.73
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 360167 ns 384958.5 ns 0.94
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 346666 ns 348083 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 437417 ns 444917 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30125 ns 30125 ns 1
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22977 ns 23128 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 727167 ns 738542 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 782250 ns 791896 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1026667 ns 1018521 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 90000 ns 105270.5 ns 0.85
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 196309.5 ns 225989 ns 0.87
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3625 ns 3708 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3541 ns 3708 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3625 ns 3792 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3458 ns 3542 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 18016 ns 17710.5 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4541 ns 4500 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4375 ns 4250 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4292 ns 4500 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4250 ns 4417 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 210663.5 ns 279830 ns 0.75
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3750 ns 3666 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3625 ns 4125 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4500 ns 4500 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3708 ns 3708 ns 1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 158953 ns 199332 ns 0.80
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8750 ns 8875 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8167 ns 8500 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8875 ns 8458 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8500 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 976072 ns 1228522.5 ns 0.79
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203542 ns 203645.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 211375 ns 210208 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 212125 ns 210792 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200042 ns 201125 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35273 ns 34981 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 649750 ns 611520.5 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 622083 ns 624479.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 673000 ns 624000.5 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 628584 ns 630624.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 286304.5 ns 361212 ns 0.79
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 1006541.5 ns 995583 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1012562.5 ns 1022646 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 950084 ns 952562 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 867374.5 ns 869209 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 208692 ns 207395.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4662333 ns 4529458 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4724042 ns 4744750 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4460291 ns 4448625 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 5133479.5 ns 5089542 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 931046 ns 933469 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3750 ns 3666 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3416 ns 3375 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 3875 ns 4167 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3000 ns 3209 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 160179 ns 242210.5 ns 0.66
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7708 ns 7792 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7000 ns 7167 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7500 ns 7417 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 7208 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 834512 ns 1046390.5 ns 0.80
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1638021 ns 1637500.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1178750.5 ns 1186917 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1368583 ns 1336062.5 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2435458 ns 2468375 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 212757 ns 213930 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12417125 ns 12339333 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9573771 ns 9615979.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9272896 ns 9254104 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18032250 ns 17996208 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1947684.5 ns 1954541 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17407875.5 ns 17361479 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14413792 ns 14427833 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14355521 ns 14271583.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21131291.5 ns 21144917 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 89791 ns 88834 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 90333 ns 91500 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 91667 ns 90834 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 88604 ns 87625 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125843 ns 125982 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2042500 ns 2019500 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2024209 ns 2042708 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2017334 ns 2028042 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2030458 ns 2025999.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 851622 ns 1063927.5 ns 0.80
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 1500 ns 3541.5 ns 0.42
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2250 ns 2333 ns 0.96
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3833 ns 3584 ns 1.07
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2250 ns 1500 ns 1.50
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15376 ns 15780.5 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2916 ns 3000 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2459 ns 2958 ns 0.83
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2791 ns 2709 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2917 ns 2875 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 153882 ns 195545.5 ns 0.79
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7208 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6042 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 10000 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33856.5 ns 33801 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221791 ns 221667 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220646 ns 228625 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220479.5 ns 221000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 241958.5 ns 206709 ns 1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 266253.5 ns 347206.5 ns 0.77
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22475 ns 22295 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14209 ns 14500 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14333 ns 14459 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14459 ns 14458 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14417 ns 14417 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 372905 ns 485845.5 ns 0.77
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 96208 ns 92250 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 95604 ns 94209 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 96583.5 ns 95250 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 91812.5 ns 91750 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125359 ns 125421 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1942458 ns 1929000 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1923146 ns 1929917 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1909167 ns 1922333 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1932625 ns 1923646 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 780596.5 ns 960449 ns 0.81
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 859584 ns 875291 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 815917 ns 817687.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1209375 ns 1220791.5 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 960270.5 ns 956708 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 271785 ns 270219.5 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2844229 ns 2786125 ns 1.02
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2490542 ns 2476771 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3348000.5 ns 3326500 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3404749.5 ns 3277354 ns 1.04
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1487247 ns 1614761 ns 0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17416.5 ns 16229 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17416 ns 18000 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18333 ns 17084 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17604.5 ns 14895.5 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 140524.5 ns 142802.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 261583 ns 222458 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215667 ns 216625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 257416.5 ns 216270.5 ns 1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215792 ns 225667 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 572039 ns 642849 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 222625 ns 220666.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222062.5 ns 223437.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 222146 ns 221291.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 220833 ns 220208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 232890 ns 270694.5 ns 0.86
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 507750 ns 498104.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 501667 ns 505958 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 556500 ns 498020.5 ns 1.12
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 507875 ns 500229 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1207816 ns 1376499 ns 0.88
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 4292 ns 4000 ns 1.07
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 4042 ns 3667 ns 1.10
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 4541.5 ns 5875 ns 0.77
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 3875 ns 3666 ns 1.06
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16753 ns 16958 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7625 ns 7333 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7125 ns 7333 ns 0.97
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7167 ns 7125 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7270.5 ns 7542 ns 0.96
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 176857.5 ns 195319 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18542 ns 17333 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16958 ns 20291.5 ns 0.84
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19792 ns 19354.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16562.5 ns 16708.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 145193.5 ns 146982.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224708 ns 214625 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 211854 ns 212500 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 238145.5 ns 212792 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212042 ns 221417 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 888620 ns 1020818 ns 0.87
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4917 ns 4125 ns 1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4208 ns 4500 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5042 ns 5291 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3667 ns 3542 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 184696.5 ns 241162.5 ns 0.77
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10875 ns 11000 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10584 ns 10459 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11042 ns 10916 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10458 ns 10125 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 966049 ns 1056501 ns 0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3645.5 ns 3333 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3209 ns 3875 ns 0.83
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 3792 ns 4250 ns 0.89
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 2833 ns 2938 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 188943.5 ns 237567.5 ns 0.80
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8000 ns 7875 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7709 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7958 ns 7250 ns 1.10
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7208 ns 7458.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1007792.5 ns 1070019 ns 0.94
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 24183291.5 ns 23347771 ns 1.04
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34946479 ns 35406500 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37338083 ns 37669583 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34888125 ns 34858666 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1782868.5 ns 1830001 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 186454375 ns 183823166 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159896583 ns 159867750 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 145990104.5 ns 146428479.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 411376042 ns 410553708 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16457564 ns 16506890.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 432652834 ns 424862333.5 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 247809833 ns 253527416.5 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 279749334 ns 295623854.5 ns 0.95
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 479974375 ns 480544667 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183958.5 ns 182875 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182375 ns 185563 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185500 ns 184500 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 184062.5 ns 182250 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 172057.5 ns 218471 ns 0.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 637709 ns 633375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 586041.5 ns 596208 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 639084 ns 587250 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 596416 ns 590520.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1002959 ns 1067870 ns 0.94
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 4026750 ns 3926937.5 ns 1.03
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3920250 ns 3941459 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3579209 ns 3667000 ns 0.98
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4570291.5 ns 4544333.5 ns 1.01
batchedmm(128, Bsize=512)/forward/GPU/CUDA 532647 ns 531767 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17895041.5 ns 17389166 ns 1.03
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17836083 ns 17947521 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16489292 ns 16390812 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 20147270.5 ns 19902458.5 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2607011.5 ns 2636393 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32522 ns 32468 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9729 ns 9479.5 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9291 ns 9500 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9625 ns 9584 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9209 ns 8875 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 258262.5 ns 263813 ns 0.98
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 503041917 ns 498564458 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 424847437.5 ns 426956020.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 425274250 ns 423367333 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 682175395.5 ns 596263958 ns 1.14
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12478951 ns 12482792 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1889075833 ns 1875323562.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1625727875 ns 1628477375 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1494457604.5 ns 1492393083.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2214128083.5 ns 2205444916.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49385566.5 ns 49302271 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1647625 ns 1639125 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1201312.5 ns 1202125 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1376271 ns 1357187.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2354042 ns 2457312 ns 0.96
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214603 ns 213583.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12810437.5 ns 12714125 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9968417 ns 9952750 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9702395.5 ns 9614459 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18320249.5 ns 18361979 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2015837.5 ns 2064490 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17772083 ns 17715625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14741771 ns 14737021 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14583292 ns 14521854 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21392208 ns 21413792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26292 ns 26292 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26208 ns 26666 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26167 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23824 ns 24074 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67125 ns 67542 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67500 ns 67625 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67958 ns 68125 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67000 ns 67042 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 377030.5 ns 400556.5 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204125 ns 203812.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209792 ns 210750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210750 ns 209833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200292 ns 199375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26462 ns 27041 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 650250 ns 627333 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 625708.5 ns 626584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 669874.5 ns 622042 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 629250 ns 580541 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 303651 ns 355125.5 ns 0.86
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 627292 ns 640833 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 671583 ns 653000 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 598312 ns 599854 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 639791 ns 599062.5 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132031 ns 132599.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2336625 ns 2247625 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2255375 ns 2173250 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2235562.5 ns 2242375 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2236583 ns 2238250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1129126 ns 1242951 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18437 ns 17459 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18354.5 ns 19917 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20062.5 ns 18958 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17104.5 ns 17500 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144037 ns 146290 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 265000 ns 227125 ns 1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 230729 ns 229687.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231875 ns 219959 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 258875 ns 218709 ns 1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 929000 ns 1041939 ns 0.89
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 708 ns 625 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 666 ns 666 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23483 ns 24055 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10125 ns 10042 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9541 ns 9875 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10208 ns 10292 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9291 ns 9333 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 253535 ns 260885.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5958 ns 6208 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5417 ns 6042 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6166 ns 6416 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4959 ns 5062.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 177041 ns 223280.5 ns 0.79
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7770.5 ns 7625 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7416 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7875 ns 7459 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 6875 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 724899.5 ns 794061 ns 0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2334 ns 2083 ns 1.12
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2042 ns 2333 ns 0.88
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2208 ns 2542 ns 0.87
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2292 ns 2166 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17786 ns 17949 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6667 ns 6584 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6958 ns 6584 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6625 ns 6792 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6500 ns 6520.5 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 316064.5 ns 335163 ns 0.94
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 752459 ns 746958.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746750 ns 746875 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 750791 ns 749792 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 746917 ns 751729 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21186 ns 21434 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 794041.5 ns 819625 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 787583 ns 791708 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 810166.5 ns 773145.5 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 777749.5 ns 790854 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 292715.5 ns 298785 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7375 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6042 ns 5958 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10166 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33031.5 ns 33922 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 260583 ns 220854.5 ns 1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 266771 ns 236854.5 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 240125 ns 228083.5 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213791 ns 212875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 347920 ns 365652.5 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10417 ns 10209 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10083 ns 10417 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10666 ns 10708 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9770.5 ns 9541.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 236152.5 ns 251155 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24958 ns 24333 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24125 ns 24584 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25625 ns 24583 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24625 ns 24979 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1075060 ns 1135827 ns 0.95
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106687708 ns 106024583.5 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 118577083.5 ns 117903521 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120497312.5 ns 120396396 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 118064771 ns 117544479 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2612121 ns 2631384.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 394040917 ns 385240458 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 367160584 ns 368294084 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 357048666 ns 356727875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 483172291 ns 482802291 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15226002.5 ns 15255065.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 944093812.5 ns 936146916.5 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 581088583 ns 762770042 ns 0.76
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 744439291.5 ns 746849979.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 770449312.5 ns 945639875 ns 0.81
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7833.5 ns 7541 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6584 ns 7250 ns 0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7667 ns 7583.5 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6584 ns 6479 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 231298 ns 243530.5 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14833.5 ns 14458 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13833 ns 13959 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14333 ns 14750 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13667 ns 13833 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1030746 ns 1088103 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6812.5 ns 6417 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6250 ns 6292 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8625 ns 7166.5 ns 1.20
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5458 ns 5750 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 228035.5 ns 238108.5 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13541 ns 13084 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12250 ns 12916 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13417 ns 12583 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12375 ns 12166 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 749909.5 ns 798707 ns 0.94
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5562.5 ns 5584 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5917 ns 5770.5 ns 1.03
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 5959 ns 6500 ns 0.92
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5625 ns 7166.5 ns 0.78
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17374 ns 17513 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15979.5 ns 15667 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15291 ns 15625 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15666.5 ns 15541 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15791 ns 15583 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 198865.5 ns 202130 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 416 ns 416 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 334 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23594 ns 23880 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6770.5 ns 6584 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6416 ns 6458 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 6583 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6416 ns 6187.5 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 238325.5 ns 241952.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5958 ns 5917 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5917 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5958 ns 5959 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 5834 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24848 ns 25052 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22291.5 ns 21667 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21375 ns 21750 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21750 ns 21875 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21833 ns 21167 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 262151 ns 267258 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145041 ns 144125 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 179792 ns 144791 ns 1.24
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147000 ns 146500 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145833 ns 143125 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167939 ns 168261.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1367292 ns 1324313 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1334375 ns 1331958 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1330499.5 ns 1325708 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1319209 ns 1319666.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1299116 ns 1358754 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23000 ns 24416.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24062.5 ns 24834 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24875 ns 23375 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21542 ns 21292 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 285873 ns 357948 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 181458 ns 132395.5 ns 1.37
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 142020.5 ns 127354.5 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 130312 ns 118459 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 166291 ns 117395.5 ns 1.42
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1432985 ns 1501059 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 24013 ns 23530 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6667 ns 6875 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6292 ns 6666 ns 0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6708 ns 7125 ns 0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6208 ns 6417 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 257668.5 ns 259579 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4875 ns 4625 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4541 ns 4708 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4917 ns 5042 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4334 ns 4084 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 248170.5 ns 258332.5 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10541 ns 10375 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9500 ns 10209 ns 0.93
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10583 ns 10209 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10000 ns 10166.5 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1315251.5 ns 1363356 ns 0.96
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1667 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 1583 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23770.5 ns 23506 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6083 ns 5958 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5625 ns 6042 ns 0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6000 ns 6125 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5625 ns 5667 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 277301 ns 277914 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6853687 ns 6791458.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6416292 ns 6360916.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6504750 ns 6541917 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7620312.5 ns 7577625 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214867.5 ns 214916.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24153125 ns 24027042 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21320542 ns 21266917 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21047708.5 ns 21002500 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29760542 ns 29759417 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2095640.5 ns 2132435.5 ns 0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 48863062.5 ns 48562041 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 34327709 ns 45901125 ns 0.75
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45697437.5 ns 45588125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38239917 ns 49263125 ns 0.78
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6708 ns 6000 ns 1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5666 ns 6167 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6459 ns 6625 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5770.5 ns 5334 ns 1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 232386 ns 236967.5 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9062.5 ns 9250 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8375 ns 8458 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8375 ns 8958 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8291 ns 8250 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1027676.5 ns 1058397 ns 0.97
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1539229.5 ns 1553000 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1264500 ns 1271333 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1616916 ns 1611667 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2152625 ns 2139521 ns 1.01
lenet(28, 28, 1, 128)/forward/GPU/CUDA 281859 ns 272139 ns 1.04
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7990000 ns 7938708.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6612375 ns 6600938 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7167458 ns 7126750 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10472916.5 ns 10443521 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1870517 ns 1846977 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 359666 ns 374500.5 ns 0.96
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 372896 ns 372770.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 456458 ns 456750 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 22396 ns 23000 ns 0.97
batchedmm(128, Bsize=4)/forward/GPU/CUDA 47625 ns 46393 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 739666 ns 736917 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 822937.5 ns 808083 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1053333 ns 1057958 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 109291 ns 78020.5 ns 1.40
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 240230 ns 308525 ns 0.78
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396792 ns 397417 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288042 ns 287917 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287917 ns 288000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755250 ns 753542 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 45350 ns 43767 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 639083 ns 673583 ns 0.95
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531000 ns 536166 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 531625 ns 531917 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 973083 ns 973208 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 194303 ns 188160 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 636645.5 ns 633500 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 636021 ns 647250 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 652063 ns 599709 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 654042 ns 615666 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133147 ns 131655.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2499458 ns 2457916 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2456708 ns 2396750 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2459542 ns 2458187.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2452854 ns 2452625 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1214588 ns 1345493 ns 0.90
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 2209 ns 3083 ns 0.72
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3041 ns 2833 ns 1.07
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4667 ns 4500 ns 1.04
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 2792 ns 2583 ns 1.08
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16731 ns 16191 ns 1.03
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5625 ns 5750 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5333 ns 5584 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5625 ns 5459 ns 1.03
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5584 ns 5625 ns 0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 199833.5 ns 198160.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1461916.5 ns 1458292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1505708 ns 1499833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1503458 ns 1499083 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1437083 ns 1437417 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41276 ns 40922 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5154479 ns 5128625 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5307146 ns 5308187.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5288209 ns 5301146 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5001917 ns 4993250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 200453 ns 195601.5 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34571 ns 33852 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15250 ns 15500 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15250 ns 15334 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15375 ns 15334 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15125 ns 15166 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 372573 ns 381247 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71375 ns 71292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71583 ns 71416 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71208 ns 71167 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71250 ns 70916 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 114012 ns 113823.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 325917 ns 317500 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 325167 ns 321000 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 318375 ns 319083 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 317750 ns 318500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 199225 ns 197369 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 1125 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1084 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1084 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 959 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24050 ns 24373 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8584 ns 8333.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8084 ns 8229.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8375 ns 8250 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8000 ns 7833 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 262017.5 ns 265338.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 497584 ns 511459 ns 0.97
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 490208.5 ns 488042 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 559959 ns 567084 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 148209 ns 220750 ns 0.67
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129838.5 ns 129208 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1405375 ns 1389625 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1471875 ns 1480250 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1758791.5 ns 1756312.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 869583 ns 865000 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 274551 ns 277406 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 417 ns 416 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32490 ns 32170 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 6875 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6208 ns 6625 ns 0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6541 ns 6458 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6020.5 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 265808 ns 266374 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1723271 ns 1718417 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1751146 ns 1721417 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1734270.5 ns 1726125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1724083 ns 1719500 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169537.5 ns 169010.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4419270.5 ns 4367625 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4365292 ns 4399270.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4351792 ns 4374042 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4357792 ns 4359438 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1171701 ns 1258694 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6833.5 ns 6500 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7062.5 ns 6625 ns 1.07
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7833 ns 7208.5 ns 1.09
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6833 ns 6542 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20938 ns 20518 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 72249.5 ns 32542 ns 2.22
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 51291.5 ns 52479.5 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 52875 ns 52000 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51333 ns 32625 ns 1.57
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 211685.5 ns 210236 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17709 ns 17542 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 18333 ns 17917 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18312.5 ns 18708 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17625 ns 17375 ns 1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18852 ns 18845.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53583 ns 53750 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 52958 ns 53208 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53625 ns 53250 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53417 ns 53500 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 337333.5 ns 344404.5 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75417 ns 75292 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75375 ns 75500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75334 ns 74833 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75333 ns 74959 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47609 ns 47057 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 339875 ns 323708 ns 1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 332958 ns 338541 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 325791 ns 326000 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324042 ns 325417 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 215842 ns 211393 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1486125 ns 1486000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1530958 ns 1527542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1527584 ns 1526208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1463416 ns 1463000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 52815 ns 52398 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5149209 ns 5120500 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5312291.5 ns 5242958 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5298250 ns 5297166.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4995000 ns 4985916.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 207728 ns 204362 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28250 ns 28250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28208 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28291 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28292 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24971.5 ns 25076 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66292 ns 66417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66375 ns 66417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66209 ns 66792 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66500 ns 66458 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 510271 ns 540264.5 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1349333.5 ns 1467604.5 ns 0.92
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1135833 ns 1148208 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1132458 ns 1073125 ns 1.06
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2196062.5 ns 2179542 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 589889 ns 575331.5 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3042333 ns 3075042 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2731792 ns 2748167 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2726167 ns 2727604 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3811625 ns 3816646 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2004374 ns 2066149 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8038292 ns 7917125 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7942499.5 ns 7956750 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7931979.5 ns 7912958 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4817250 ns 4824417 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80499.5 ns 81334 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82250 ns 82000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82500 ns 81895.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80479.5 ns 80250 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194209 ns 193566.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2050042 ns 2017375 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2034333.5 ns 2065916.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2017875 ns 2015625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2018854 ns 2021542 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 768336 ns 803967 ns 0.96

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.