Skip to content

Commit

Permalink
refactor: cleanup some old pre-1.0 hacks (#1102)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 24, 2024
1 parent d755929 commit cd96335
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 13 deletions.
15 changes: 7 additions & 8 deletions src/extended_ops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,19 +224,18 @@ CRC.@non_differentiable istraining(::Any)

end

using .LuxOps: LuxOps, multigate
using .LuxOps: LuxOps, multigate, xlogx, xlogy, foldl_init

const safe_getproperty = LuxOps.getproperty
const safe_eachslice = LuxOps.eachslice

# TODO: directly import them from LuxOps from 1.0
const private_xlogx = LuxOps.xlogx
const private_xlogy = LuxOps.xlogy
const private_foldl_init = LuxOps.foldl_init

# These are defined here to avoid a circular dependency among modules
for (op, field) in (:bias => :use_bias, :affine => :affine,
:track_stats => :track_stats, :train_state => :train_state)
for (op, field) in (
:bias => :use_bias,
:affine => :affine,
:track_stats => :track_stats,
:train_state => :train_state
)
@eval function $(Symbol(:has_, op))(l::AbstractLuxLayer)
res = known(safe_getproperty(l, Val($(Meta.quot(field)))))
return ifelse(res === nothing, false, res)
Expand Down
7 changes: 3 additions & 4 deletions src/helpers/losses.jl
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,7 @@ end

for logits in (true, false)
return_expr = logits ? :(return loss.agg((1 .- ỹ) .* ŷ .- logsigmoid.(ŷ))) :
:(return loss.agg(-private_xlogy.(ỹ, ŷ .+ ϵ) .-
private_xlogy.(1 .- ỹ, 1 .- ŷ .+ ϵ)))
:(return loss.agg(-xlogy.(ỹ, ŷ .+ ϵ) .- xlogy.(1 .- ỹ, 1 .- ŷ .+ ϵ)))

@eval function unsafe_apply_loss(loss::BinaryCrossEntropyLoss{$(logits)}, ŷ, y)
T = promote_type(eltype(ŷ), eltype(y))
Expand Down Expand Up @@ -387,7 +386,7 @@ for logits in (true, false)
:(return LossFunctionImpl.fused_agg(
loss.agg, -, sum(ỹ .* logsoftmax(ŷ; loss.dims); loss.dims))) :
:(return LossFunctionImpl.fused_agg(
loss.agg, -, sum(private_xlogy.(ỹ, ŷ .+ ϵ); loss.dims)))
loss.agg, -, sum(xlogy.(ỹ, ŷ .+ ϵ); loss.dims)))

@eval function unsafe_apply_loss(loss::CrossEntropyLoss{$(logits)}, ŷ, y)
T = promote_type(eltype(ŷ), eltype(y))
Expand Down Expand Up @@ -603,7 +602,7 @@ end
function unsafe_apply_loss(loss::KLDivergenceLoss, ŷ, y)
cross_entropy = unsafe_apply_loss(loss.celoss, ŷ, y)
# Intentional broadcasting for Zygote type stability
entropy = loss.agg(sum(private_xlogx.(y); loss.dims))
entropy = loss.agg(sum(xlogx.(y); loss.dims))
return entropy + cross_entropy
end

Expand Down
2 changes: 1 addition & 1 deletion src/layers/recurrent.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ function (r::Recurrence{True})(x::Union{AbstractVector, NTuple}, ps, st::NamedTu
(out, carry), state = apply(r.cell, (input, carry), ps, state)
return vcat(outputs, [out]), carry, state
end
results = private_foldl_init(recur_op, x)
results = foldl_init(recur_op, x)
return first(results), last(results)
end

Expand Down

1 comment on commit cd96335

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: cd96335 Previous: d755929 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4375 ns 4083 ns 1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4208 ns 4458 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5042 ns 4583 ns 1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3833 ns 4458 ns 0.86
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 59750 ns 61537 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10229.5 ns 9958 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10458 ns 11083 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11208 ns 10125 ns 1.11
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10083.5 ns 10292 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 421969 ns 428120 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1042 ns 1208 ns 0.86
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1333 ns 1333 ns 1
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1334 ns 1333 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1167 ns 1042 ns 1.12
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18218 ns 17813 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3791 ns 3959 ns 0.96
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4125 ns 4042 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4375 ns 4375 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4083 ns 4000 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 110020 ns 110308 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 55625 ns 57500 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46833 ns 38333 ns 1.22
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46208 ns 46625 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81750 ns 82166 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36958.5 ns 36705 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2050166 ns 2027541 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2100334 ns 2090041.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2073937.5 ns 2097083 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1993041 ns 1999875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195385 ns 195283 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 143208 ns 143625 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143958.5 ns 143417 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146000 ns 145584 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 182375 ns 147187.5 ns 1.24
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165528 ns 166525 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1157292 ns 1109542 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1158062.5 ns 1126812.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1107125 ns 1122083 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1113937.5 ns 1020645.5 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 525805 ns 533338 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3542 ns 3375 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3959 ns 3416 ns 1.16
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4458 ns 4541 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3458 ns 3604.5 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 70267.5 ns 68868.5 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8792 ns 9292 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8667 ns 9542 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9500 ns 9792 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9333 ns 8833 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 486148 ns 494765.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15916 ns 15583 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15208 ns 16458 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17458 ns 16500 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15750 ns 15083 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 55035.5 ns 54721 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214687.5 ns 212833 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213875 ns 215167 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214499.5 ns 214416 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214020.5 ns 212417 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 271923 ns 274119.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 584 ns 583 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 792 ns 0.68
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 750 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17550 ns 17270 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1667 ns 1667 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1625 ns 1667 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1458 ns 1.29
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1583 ns 1708 ns 0.93
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 102829 ns 103124 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7083 ns 7291 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6041 ns 5292 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5917 ns 5958 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 9917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23605 ns 23563 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221000 ns 220708 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229416.5 ns 236874.5 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230875 ns 228875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 252791.5 ns 220166 ns 1.15
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 168416.5 ns 169828.5 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 3875 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3916 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23282 ns 23299 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16625 ns 16708 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16875 ns 16833 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16958 ns 16834 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16667 ns 16667 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 160471 ns 162920 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 574250 ns 574791 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 576167 ns 578334 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 579895.5 ns 574000 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 573917 ns 574333 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113142 ns 113504 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1424041.5 ns 1420083 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1417292 ns 1415750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1420500 ns 1420208 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1425500 ns 1425187.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 209769 ns 212199 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1054333.5 ns 1067895.5 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 959917 ns 940416 ns 1.02
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1343583.5 ns 1346520.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1300896 ns 1295333 ns 1.00
lenet(28, 28, 1, 64)/forward/GPU/CUDA 279273.5 ns 276087 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5749437.5 ns 6005792 ns 0.96
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4599687.5 ns 4619125 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4952395.5 ns 4921458.5 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5610084 ns 5705500 ns 0.98
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1087158.5 ns 1093586 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 542 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23646 ns 23336 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2084 ns 2167 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2166 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2209 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 173162 ns 170662.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4542 ns 4083 ns 1.11
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4208 ns 4250 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5000 ns 5250 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4000 ns 4250 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 64791.5 ns 66890.5 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11208 ns 11166 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11291.5 ns 11750 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12209 ns 11792 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11292 ns 11145.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 449166 ns 455730.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6792 ns 6708 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6833 ns 6917 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8312.5 ns 8000 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6250 ns 6833 ns 0.91
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 51887 ns 53251 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16792 ns 17646 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16667 ns 17687.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17500 ns 17583 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17875 ns 18520.5 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 301591 ns 303857.5 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32520 ns 32349 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8625 ns 8500 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8416.5 ns 9458 ns 0.89
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9417 ns 9291 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8250 ns 9375 ns 0.88
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 159487 ns 158134 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64959 ns 64375 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64667 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64416 ns 64542 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64541 ns 64417 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 110435.5 ns 111051 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 291250 ns 280917 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 285125 ns 285417 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 274833.5 ns 280750 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 279770.5 ns 279291.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 183913 ns 185526.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3222417 ns 3281750 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3060583 ns 2797500 ns 1.09
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3017291.5 ns 3018917 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4070708 ns 4088625 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 571448 ns 571296 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7560916.5 ns 7642500 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7434917 ns 7291354 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7464958 ns 7449292 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8157583.5 ns 8096333 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1323265 ns 1326986 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17698375 ns 17512333 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17382541 ns 17557479.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17917041 ns 17568792 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14113978.5 ns 14165000 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 24259667 ns 23618750 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33537791.5 ns 43411666 ns 0.77
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37485625 ns 37050562 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34876854.5 ns 34914229.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1864963 ns 1853387 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 191699417 ns 187623875 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 233048750 ns 247457083 ns 0.94
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 194089542 ns 194208333 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 434858250 ns 434785500 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13855629 ns 13912861.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 292275916 ns 289468416 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 336958667 ns 350360437.5 ns 0.96
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 297206917 ns 297011958 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 408837354 ns 409128187.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22333 ns 24042 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24521 ns 23958 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23666 ns 23916 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22417 ns 22020.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 95962.5 ns 96407 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 104625 ns 103208.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103334 ns 104791 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104875 ns 104667 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103250 ns 103417 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 503280 ns 511501 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6417 ns 6145.5 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6250 ns 5625 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7250 ns 6979.5 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6041 ns 5709 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 67524 ns 69596.5 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15250 ns 14520.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15500 ns 15542 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16125 ns 16125 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 12875 ns 14625 ns 0.88
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 474310.5 ns 479202.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2994417 ns 3041750 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2072458 ns 2066041.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2264416 ns 2266312 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4512000 ns 4490041.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 589406.5 ns 590463 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23917916 ns 23486917 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18038749.5 ns 18259854 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17983750 ns 17822021 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35261125 ns 35704478.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2768485.5 ns 2768088 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33831646.5 ns 33321020.5 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27630729 ns 28000312.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28545541 ns 28560333.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41340292 ns 41618958 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72833 ns 72209 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73521 ns 81645.5 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74958 ns 74917 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 83500 ns 72396 ns 1.15
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102113 ns 105122.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 208042 ns 278083 ns 0.75
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 291208 ns 314375 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219875 ns 208562.5 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217417 ns 241750.5 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 550239 ns 565906 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11916 ns 11417 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12042 ns 11833.5 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13000 ns 12250 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11583 ns 12166 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 70941.5 ns 73969 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26541 ns 26125 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26875 ns 27542 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27833 ns 26708 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26708 ns 26708 ns 1
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 472589 ns 488459.5 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12083 ns 12208 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12917 ns 13084 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13771 ns 13833 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12334 ns 12687.5 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 52605 ns 55593 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25917 ns 25333 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25625 ns 26458 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 25958 ns 26250 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26542 ns 26458 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 304518.5 ns 314229 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179750 ns 181625 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 181375 ns 180250 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 182875 ns 183667 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182083 ns 179417 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 57612 ns 58869 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 585375 ns 587667 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 582375 ns 585625 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 584291.5 ns 583584 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 585895.5 ns 584708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 287910.5 ns 294563.5 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6396 ns 5395.5 ns 1.19
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6084 ns 6042 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7667 ns 8416.5 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5542 ns 8791 ns 0.63
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 70404 ns 73281.5 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14458 ns 13834 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14917 ns 15125 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16000 ns 14333 ns 1.12
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14416 ns 14250 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 461584 ns 478456 ns 0.96
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1193604.5 ns 1191541 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1246000 ns 1236750 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1273583.5 ns 1285583.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1016875 ns 1003417 ns 1.01
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301246 ns 302585 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4298583 ns 4114354 ns 1.04
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4454937.5 ns 4527875 ns 0.98
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4559833 ns 4560333.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3718125 ns 3695000 ns 1.01
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1052722 ns 1056192.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 24315 ns 23824 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5000 ns 4875 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4916 ns 4917 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4917 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4916 ns 4959 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 193381 ns 193428 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6270.5 ns 6084 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6292 ns 6166 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7042 ns 6917 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5666 ns 6209 ns 0.91
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 56858.5 ns 57953 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11000 ns 10333 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10584 ns 11709 ns 0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11292 ns 11333 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10625 ns 11583 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 341133.5 ns 343622 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 416 ns 334 ns 1.25
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 334 ns 333 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23459 ns 23294 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2792 ns 2792 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2708 ns 3042 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3000 ns 3000 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2750 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 163121 ns 163978 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12209 ns 11375 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12250 ns 11666 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 15083 ns 12500 ns 1.21
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11375 ns 11542 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 59412 ns 59566.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24792 ns 24459 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24833 ns 25042 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25042 ns 25083.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25167 ns 25083 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 302787.5 ns 305262.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4250 ns 4208 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4167 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4208 ns 4209 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25427.5 ns 25152 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16041 ns 16083 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16250 ns 16042 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16125 ns 16375 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16084 ns 16167 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 203537 ns 203575 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5750 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 5833 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34639 ns 34167 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21375 ns 20291.5 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21125 ns 21041 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22125 ns 21500 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23000 ns 21167 ns 1.09
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 181357 ns 180386.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 404042 ns 420667 ns 0.96
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 390084 ns 363520.5 ns 1.07
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 483167 ns 482000 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 103834 ns 125291.5 ns 0.83
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67491 ns 67480 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 913854 ns 897041 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 961459 ns 967000.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1201334 ns 1167958 ns 1.03
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 448417 ns 396500 ns 1.13
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 192152 ns 197078.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80542 ns 80125 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81500 ns 81020.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 79854.5 ns 82625 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 78813 ns 83458 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193447.5 ns 194831 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1946833 ns 1694000 ns 1.15
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1932479 ns 1917291.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1920708 ns 1931459 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1904937.5 ns 1896062.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 402534 ns 416256.5 ns 0.97
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22000 ns 22312 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 169877.5 ns 176862.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7521 ns 6208 ns 1.21
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7167 ns 6875 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7792 ns 7750 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6500 ns 7000 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61779 ns 62506 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9250 ns 8833 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9500 ns 9250 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9042 ns 9417 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9292 ns 9333 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 314965 ns 325531 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 158324292 ns 121103854.5 ns 1.31
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174385041 ns 181392229 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148149145.5 ns 147959958.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104978917 ns 103681750 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5475583 ns 5500074 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 673914521 ns 613086875 ns 1.10
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 556536500 ns 578493750 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 454282229 ns 454857041.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 754352104 ns 752941812.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 35161544.5 ns 35077599 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 703002500 ns 649102417 ns 1.08
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 668300021 ns 685608520.5 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 587968625 ns 589011249.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 742489083 ns 739858625 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57833 ns 59500 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 48000 ns 38708 ns 1.24
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47959 ns 48000 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82333 ns 82708 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38135 ns 38528 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1945042 ns 1741292 ns 1.12
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1994937.5 ns 1966416 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1978208 ns 1984416 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1862834 ns 1859270.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 174772.5 ns 177396 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267333 ns 271125 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 267521 ns 274250 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 268709 ns 268416 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 266959 ns 267791.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 138445.5 ns 137600.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 605250 ns 587833 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 597333.5 ns 666917 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 696500 ns 587208 ns 1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 676042 ns 665917 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 740206.5 ns 757074 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2204042 ns 2224291.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2205084 ns 2235083 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2220750 ns 2099770.5 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2219958 ns 2218208 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 135150.5 ns 135238 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5598583 ns 5494167 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5526083 ns 5547875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5502958 ns 5497792 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5487708.5 ns 5395666.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 792599 ns 797087 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 660166 ns 643250 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 643583 ns 646958 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 659417 ns 642375 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 644542 ns 640208 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47532 ns 47636 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1795875 ns 1820958 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1722291 ns 1668166 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1727709 ns 1721291 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2095458 ns 2100708 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 227325 ns 227359.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56375 ns 58583 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46291 ns 38208.5 ns 1.21
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46625 ns 47292 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82500 ns 82750 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29417 ns 29299.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2030542 ns 2023770.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2111062.5 ns 2018000 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2091895.5 ns 2096292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1996833 ns 1983895.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 193004 ns 191243 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13382833 ns 13392479 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12443000 ns 12447084 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12480979 ns 12573562.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15173917 ns 15225667 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 517073 ns 515936 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47607083 ns 47214583.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41883313 ns 42007792 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 40854417 ns 40831167 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58509979 ns 58287250 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2896765.5 ns 2893597 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 97269708 ns 73879562 ns 1.32
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 68581771 ns 91062583 ns 0.75
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90434166 ns 90595250 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 98826583 ns 98708500 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56833 ns 59041 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47417 ns 38458 ns 1.23
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47291 ns 47500 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80833 ns 83041 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 46888 ns 46889 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1939104 ns 1914042 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2010459 ns 1980250 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1977312.5 ns 1983041.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1892292 ns 1895208.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 192004 ns 191685.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 416 ns 0.80
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 31834 ns 31909.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6084 ns 5958 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6083 ns 6666 ns 0.91
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6709 ns 6500 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6167 ns 6667 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 176223.5 ns 174339.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31304 ns 31434 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2584 ns 2959 ns 0.87
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2917 ns 2792 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2667 ns 2833 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 164663.5 ns 161698.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 324499000.5 ns 284655874.5 ns 1.14
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340579375 ns 346665396 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 313389416.5 ns 314185249.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 273909208 ns 271410834 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7105361 ns 7071052.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1052816166 ns 986652459 ns 1.07
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 943649000 ns 960769500 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 840615666.5 ns 837320313 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1152028667 ns 1160509417 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34095663 ns 34004605 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1721214458 ns 1311324917 ns 1.31
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1359927020.5 ns 1697266750 ns 0.80
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1606248000 ns 1638971166 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1668736833 ns 1734387958.5 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1425375 ns 1414375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1415542 ns 1459333 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1416520.5 ns 1417583 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1410375 ns 1464750 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127634 ns 127631 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5060999.5 ns 4707666.5 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5059104 ns 5056666.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5025375 ns 5045625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5018125 ns 5028167 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 596333 ns 589690 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 163798854 ns 174231250 ns 0.94
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 128369875 ns 167491167 ns 0.77
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 130888792 ns 128702541 ns 1.02
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 168698771 ns 154878708 ns 1.09
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 5432122 ns 4890073 ns 1.11
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 630866750 ns 622332667 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 635134916 ns 581984000 ns 1.09
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 554211625 ns 496978166 ns 1.12
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 648292583 ns 643892875 ns 1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16519965 ns 16065970 ns 1.03
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 9165854 ns 8934042 ns 1.03
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8986459 ns 9020375 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7922833 ns 7917083 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9756167 ns 9692542 ns 1.01
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1610067 ns 1603050 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 37032625 ns 36495271 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 37212042 ns 38137292 ns 0.98
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33438583 ns 33438520.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 37841958 ns 37760500 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6473180 ns 6473707 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47479.5 ns 47375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47437.5 ns 47417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47667 ns 47500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47500 ns 47542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18175 ns 18555 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50250 ns 50250 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50625 ns 50375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50542 ns 50667 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50416.5 ns 50375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 243634.5 ns 207795 ns 1.17
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7229.5 ns 6375 ns 1.13
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6917 ns 7041 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7834 ns 7958 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7292 ns 7208.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 134228.5 ns 101178.5 ns 1.33
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10375 ns 10125 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9458 ns 10625 ns 0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10334 ns 10625 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10250 ns 10417 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 725024.5 ns 593102.5 ns 1.22
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6917 ns 5875 ns 1.18
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6292 ns 6208.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7417 ns 6750 ns 1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5937.5 ns 6084 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 158899 ns 121281 ns 1.31
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13334 ns 12708 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13083 ns 13541 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13958 ns 13250 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12875 ns 13208 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 654550.5 ns 511694 ns 1.28
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32302 ns 32282 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8000 ns 7750 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7958.5 ns 8042 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8000 ns 8000 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 8041 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 248668.5 ns 210142.5 ns 1.18
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23334 ns 23166 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23625 ns 23209 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23604.5 ns 23250 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23334 ns 23104.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18197 ns 18312 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52375 ns 52416 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52583 ns 52542 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52709 ns 52709 ns 1
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52291 ns 52625 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 365195 ns 291833.5 ns 1.25
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1409312.5 ns 1400833 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1395312.5 ns 1445959 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1395667 ns 1396833 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1399187.5 ns 1398917 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196466 ns 197117.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5048625 ns 5008208 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5082916.5 ns 5030250 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5010208 ns 5026354 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5015083 ns 4996437.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 697077 ns 600264 ns 1.16
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3082583 ns 3038708 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2075667 ns 2105979 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2279000 ns 2274062.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4910958 ns 4858083 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 586799 ns 586328 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24742792 ns 24399625 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18899334 ns 19072583.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18912125 ns 18904750 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36606271 ns 36638687.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2884394 ns 2819518 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34600271 ns 33955417 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28275125 ns 28785062.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27978625 ns 28141333 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41693583 ns 41707708.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 146263625 ns 142540583 ns 1.03
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 148262792 ns 146733875 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 125521666 ns 125527687.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173208104.5 ns 174248667 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22564372 ns 22566115 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 948935833 ns 968276062.5 ns 0.98
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1199705645.5 ns 860326354.5 ns 1.39
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 727524542 ns 858659167 ns 0.85
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 936153853.5 ns 683117959 ns 1.37
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 115985315 ns 118099274 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74250 ns 72375 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76209 ns 74000 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76042 ns 76250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72167 ns 73208 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 331111.5 ns 235570 ns 1.41
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 282500 ns 203292 ns 1.39
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 191083.5 ns 282896 ns 0.68
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 280584 ns 203583 ns 1.38
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 291917 ns 207583 ns 1.41
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1500994.5 ns 1260670 ns 1.19
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 36314916.5 ns 35143208 ns 1.03
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36531396 ns 36705709 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32439729.5 ns 32591958.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40435354 ns 40607646 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5837859 ns 5841170.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 151857209 ns 148155791.5 ns 1.02
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 153888604 ns 158417083.5 ns 0.97
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 135530208.5 ns 137765333 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 283241209 ns 283770667 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34859945 ns 34905958 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 159567375 ns 120795375 ns 1.32
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174506458 ns 181579562.5 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147925667 ns 148004834 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104572437 ns 108061458.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5480695 ns 5466179.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 524085270.5 ns 468909791.5 ns 1.12
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 467380250 ns 485490958.5 ns 0.96
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 437823166 ns 438520417 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 737646542 ns 742778708 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32284174.5 ns 32266057 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 696105375 ns 707166333 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 658106854.5 ns 671742104.5 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 575346979 ns 577648896 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 729353375 ns 734518917 ns 0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1155874.5 ns 1349520.5 ns 0.86
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 998792 ns 780417 ns 1.28
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 991542 ns 909417 ns 1.09
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2092625 ns 2087500 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 579446 ns 566986 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2931916.5 ns 2979167 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2619083.5 ns 2496208 ns 1.05
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2626604.5 ns 2619166 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3482417 ns 3728333 ns 0.93
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1969877.5 ns 1738136 ns 1.13
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5947625 ns 5799875 ns 1.03
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5782625 ns 5883292 ns 0.98
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5801958.5 ns 5800167 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2880584 ns 2892541.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7375 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 5292 ns 1.15
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5959 ns 6208 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9959 ns 10042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26024 ns 25118 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212562.5 ns 212333 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221083.5 ns 221583 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221333 ns 220562.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 209292 ns 215896 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 302079.5 ns 262400.5 ns 1.15
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 311414437.5 ns 307233708 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 232931208 ns 279732584 ns 0.83
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 202032375 ns 198830375 ns 1.02
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 308462875 ns 309726917 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7680461 ns 7656813 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1101691479.5 ns 1090685500 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 909424125 ns 1068219000 ns 0.85
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 804661000 ns 818375167 ns 0.98
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1153673416.5 ns 1160424021 ns 0.99
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26512167 ns 26548125.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5833.5 ns 5812.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5833 ns 5708 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6270.5 ns 6959 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5146 ns 5458 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 196235 ns 154820 ns 1.27
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7125 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7708 ns 0.92
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 7625 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7125 ns 7542 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 702510 ns 618164 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24654 ns 23615 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9333 ns 9250 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8750 ns 9458 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9875 ns 9625 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9292 ns 9750 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 239874 ns 207782.5 ns 1.15
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352958 ns 356333 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351875 ns 352417 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 351583.5 ns 356083 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352208 ns 357500.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21408 ns 21053.5 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 779709 ns 780146 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 775541 ns 776312.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 776062.5 ns 809375 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 817375 ns 826750 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 316328 ns 303323.5 ns 1.04
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 317708 ns 338396 ns 0.94
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 341667 ns 325208 ns 1.05
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 453354 ns 453375 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 10875 ns 10542 ns 1.03
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18691 ns 17732 ns 1.05
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 712145.5 ns 718917 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 734917 ns 732645.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1006834 ns 1009833 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 27250 ns 26583 ns 1.03
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 293795 ns 257155 ns 1.14
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 359083.5 ns 374000 ns 0.96
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 350250 ns 331500 ns 1.06
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 442875 ns 441875 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30583 ns 30917 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22877.5 ns 22404 ns 1.02
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 736584 ns 739437.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 783750 ns 779666.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1041500 ns 1041375.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 105875 ns 104312.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 265090.5 ns 235395 ns 1.13
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3666 ns 3625 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3667 ns 3625 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3667 ns 3625 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3750 ns 3459 ns 1.08
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17832 ns 17702 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4208 ns 4250 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4292 ns 4250 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4333 ns 4334 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4333 ns 4375 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 285935 ns 245299 ns 1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4209 ns 3479.5 ns 1.21
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3875 ns 3792 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4291 ns 4334 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3250 ns 3709 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 226147.5 ns 185222 ns 1.22
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8541 ns 8125 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8208.5 ns 8687.5 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8833 ns 8666 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8792 ns 8375 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1241590 ns 1127148 ns 1.10
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203041 ns 206541 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210833 ns 212000 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 213042 ns 211000 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200208 ns 202291 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35096 ns 34888 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 600458 ns 648750 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 664687.5 ns 634312.5 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 621125 ns 632771 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 587666 ns 596417 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 364021 ns 322649.5 ns 1.13
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 1006145.5 ns 998333 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1034750 ns 1039375 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 960375 ns 952083 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 870666.5 ns 904292 ns 0.96
batchedmm(128, Bsize=128)/forward/GPU/CUDA 207603 ns 208498.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4675520.5 ns 4540000 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4661500 ns 4817791.5 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4484166.5 ns 4468750 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 5182375 ns 5130375 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 945582 ns 959939 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4167 ns 3875 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3458 ns 3334 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4416.5 ns 4125 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3250 ns 3750 ns 0.87
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 242881.5 ns 197248.5 ns 1.23
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7625 ns 7645.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7333 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7791 ns 7292 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7166 ns 7458 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1049374 ns 1027567 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1641104.5 ns 1650375 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1162041.5 ns 1182479.5 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1361146 ns 1370292 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2337792 ns 2441916.5 ns 0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215237 ns 215671.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12428417 ns 12370500 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9554417 ns 9601667 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9282166 ns 9328687.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18043958 ns 18097145.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1957521 ns 1953457 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17446729 ns 17380125 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14307562.5 ns 14471146 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14338292 ns 14397875 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21055500 ns 21055583 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 90250 ns 91125 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 89750 ns 90875 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 92271 ns 94958 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 92625 ns 88000 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126161 ns 126032 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2045083 ns 2023583.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2029000 ns 2028542 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2032875 ns 2033312 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2022667 ns 2043416.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1071170.5 ns 1084734 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 1645.5 ns 3458.5 ns 0.48
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2375 ns 1625 ns 1.46
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3708 ns 3500 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2875 ns 1750 ns 1.64
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16073 ns 15936 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2792 ns 2584 ns 1.08
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2708 ns 2791 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2916 ns 2917 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2792 ns 2833 ns 0.99
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 196950.5 ns 195099.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7166 ns 7250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 5292 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5958 ns 6083 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10125 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33844 ns 33830 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215354.5 ns 224916 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220833.5 ns 234875 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220875 ns 231083 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 209625.5 ns 218917 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351596 ns 348229.5 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22384 ns 21982 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14250 ns 14459 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14459 ns 14208 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14334 ns 14417 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14375 ns 14584 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 522120.5 ns 489892.5 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 140291 ns 94917 ns 1.48
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 91729.5 ns 93416.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 96250 ns 99875 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 94458 ns 92625 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125465 ns 125549 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1947916 ns 1921625 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1932104.5 ns 1933333.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1925000 ns 1928500 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1650916 ns 1950604.5 ns 0.85
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1016603 ns 964756 ns 1.05
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 859167 ns 873521 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 818395.5 ns 804167 ns 1.02
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1219500 ns 1218520.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 962834 ns 954959 ns 1.01
lenet(28, 28, 1, 32)/forward/GPU/CUDA 269546 ns 285492.5 ns 0.94
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2844645.5 ns 2830854 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2436375 ns 2531000 ns 0.96
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3336375 ns 3356083 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3413042 ns 3412042 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1630539.5 ns 1671062 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15708.5 ns 16271 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16000 ns 16500 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17166.5 ns 18666.5 ns 0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14958.5 ns 18916 ns 0.79
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 143350.5 ns 144500.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217083.5 ns 260708 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215417 ns 254749.5 ns 0.85
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216916 ns 227979 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227125 ns 226584 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 653459 ns 650846.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221959 ns 222167 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 221645.5 ns 222041.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 221395.5 ns 222166 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221083 ns 220000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 274733.5 ns 277439 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 511042 ns 561333.5 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 495750 ns 549000 ns 0.90
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 497042 ns 558813 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 508875 ns 557729.5 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1471458 ns 1450310.5 ns 1.01
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 4666.5 ns 4000 ns 1.17
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 4083 ns 4166 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 5708 ns 5750 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 3917 ns 4042 ns 0.97
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16967 ns 17089 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7125 ns 7000 ns 1.02
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7417 ns 7208 ns 1.03
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7520.5 ns 7166 ns 1.05
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7250 ns 7542 ns 0.96
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 198610.5 ns 196929 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18084 ns 18083 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18166.5 ns 18959 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18666.5 ns 19250 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18395.5 ns 18124.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 148303 ns 165663 ns 0.90
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214125 ns 222875 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212791.5 ns 213896 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213042 ns 225792 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219417 ns 222042 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1024505 ns 1029397 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4625 ns 4500 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4333 ns 3958 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5708 ns 5125 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3708.5 ns 4333 ns 0.86
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 244514 ns 204180 ns 1.20
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10875 ns 10917 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10062.5 ns 10583 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11375 ns 10500 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10583 ns 10750 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1099794 ns 1058573 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4000 ns 3291.5 ns 1.22
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3792 ns 3542 ns 1.07
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4375 ns 4417 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 2750 ns 3458 ns 0.80
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 250198 ns 245634 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7270.5 ns 7458 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7459 ns 7583 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7916 ns 7625 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 7541 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1106505 ns 1074772.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 24086812.5 ns 23471041.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34704750 ns 43849166 ns 0.79
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37376896 ns 37957792 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34935000 ns 34964125 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1853477 ns 1792082 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 186942250 ns 184426958 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159685500 ns 173017604 ns 0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146457125 ns 147161645.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 411532208 ns 411405916 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16500596 ns 16521696 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 434054666 ns 426004833.5 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 253740479 ns 259123250 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 299567770.5 ns 296958750 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 479705417 ns 480245750 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 184375 ns 183042 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183584 ns 185188 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184333 ns 186041.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 185292 ns 184333.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 229399 ns 226412 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 594750 ns 597750 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 586209 ns 598229 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 586729.5 ns 632895.5 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 599250.5 ns 586958 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1121066.5 ns 1097502 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3936375 ns 3838542 ns 1.03
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 4081937 ns 4115979 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3587479 ns 3571292 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4565729.5 ns 4600166.5 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 538820 ns 534974 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 18136458.5 ns 17343875 ns 1.05
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17936750 ns 18514250 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16532771 ns 16537292 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 20226167 ns 20367667 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2633099 ns 2795688 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 666 ns 0.88
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 31971 ns 32682 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9375 ns 9042 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9000 ns 9709 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9500 ns 9500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9333 ns 9666 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 265140 ns 266437.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 503989417 ns 499772583 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 431858541.5 ns 504959958 ns 0.86
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 427434834 ns 422832542 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 592092708 ns 673427063 ns 0.88
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 11928812 ns 11842270.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1891189687.5 ns 1875482271 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1632073542 ns 1653498000 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1496948750 ns 1486024395.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2217192312.5 ns 2210913770.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49332313 ns 49084588.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1638750 ns 1649062.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1179458 ns 1182584 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1387875 ns 1392250 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2352479.5 ns 2377145.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214938 ns 218920 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12852583.5 ns 12688458.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9964500 ns 10001583.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9669416.5 ns 9698792 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18345667 ns 18502292 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2032751.5 ns 2042988 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17791875 ns 17689291 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14679354.5 ns 14793041.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14576209 ns 14622084 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21490021 ns 21477583.5 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26333 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26250 ns 26291 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23891 ns 24105 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66750 ns 67000 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67791 ns 67042 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67042 ns 67875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67042 ns 67208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 403092 ns 396461.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203542 ns 204959 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209834 ns 209958 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210500 ns 209875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199500 ns 199833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26253 ns 26682 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 602666.5 ns 646208 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 670479 ns 670000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 621791 ns 644166 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 633916 ns 630416 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351051 ns 354787 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 678375 ns 598417 ns 1.13
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 654937.5 ns 657292 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 646500 ns 664187.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 669916 ns 659708 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131843 ns 132717 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2326042 ns 2235958 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2262000 ns 2279125 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2145978.5 ns 2249833 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2234542 ns 2316042 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1242552 ns 1193695.5 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17645.5 ns 18500 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17062.5 ns 19250 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19125 ns 19292 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17875 ns 17500 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 146421.5 ns 146082 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220959 ns 259917 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219500 ns 259625 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220291 ns 230208.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 235729 ns 256708 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1091456.5 ns 1005431.5 ns 1.09
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 708 ns 625 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23721 ns 23900 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10084 ns 9750 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9791.5 ns 10333 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10041 ns 10000 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9875 ns 10000 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 261550.5 ns 259163 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6042 ns 5833 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5625 ns 5916 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6584 ns 6459 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5125 ns 5833 ns 0.88
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 234716 ns 228223.5 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7417 ns 7500 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7208 ns 7666 ns 0.94
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8041 ns 7666 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7334 ns 7333 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 806215 ns 770644 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2229.5 ns 2333 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2458 ns 2187.5 ns 1.12
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2375 ns 2292 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2292 ns 2250 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17855 ns 17986 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6542 ns 6500 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6667 ns 6666 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6834 ns 6666 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6500 ns 6625 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 333301.5 ns 321059 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 755083 ns 749208.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746333 ns 748958 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749250 ns 750125 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 750187.5 ns 748834 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21362 ns 21410 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 788958.5 ns 798125 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 772209 ns 791208 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 787687.5 ns 837729.5 ns 0.94
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 791333 ns 775270.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 298265 ns 301663.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7417 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6041 ns 5291 ns 1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5959 ns 6042 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10209 ns 10292 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33317.5 ns 33301 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 226500 ns 232896 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 236063 ns 268833.5 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228041 ns 267354.5 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 255812.5 ns 215500 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 363202 ns 361937 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10542 ns 10000 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10334 ns 9833.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11208 ns 11042 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9833 ns 10333 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 246668.5 ns 250034.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24729.5 ns 24334 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24666 ns 25250 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25542 ns 24542 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24625 ns 24334 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1134784.5 ns 1111417.5 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106546667 ns 106812374.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 118425312.5 ns 126726167 ns 0.93
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120189792 ns 121727417 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117420708 ns 118228479 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2655736 ns 2616848 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 394570417 ns 391804291 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 368931959 ns 379056792 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 424438979 ns 355535666 ns 1.19
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 482063875 ns 486452916 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15246102 ns 15186296 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 945190750 ns 756685666.5 ns 1.25
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 580209500 ns 774854291 ns 0.75
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 744122999.5 ns 746786813 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 945148083 ns 947077458 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7708 ns 8416 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7250 ns 7125 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8750 ns 8125 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6958.5 ns 9604 ns 0.72
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 238753.5 ns 240976 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14500 ns 14250 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13875 ns 14291 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14000 ns 14167 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14333 ns 14166 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1093778.5 ns 1095523 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6708 ns 5917 ns 1.13
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8208 ns 6687.5 ns 1.23
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5417 ns 6292 ns 0.86
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 238599 ns 239291 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12833 ns 12583 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12750 ns 13125 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13166 ns 13291.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12666 ns 12417 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 799288.5 ns 797358.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5667 ns 5459 ns 1.04
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 6250 ns 5833 ns 1.07
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 6459 ns 7000 ns 0.92
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5500 ns 5542 ns 0.99
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17328 ns 16938 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15583 ns 15500 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15417 ns 15458 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15625 ns 15666 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15791 ns 15875 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 202450 ns 200590 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 416 ns 0.80
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23671 ns 23824 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6541 ns 6583 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6459 ns 6500 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6666.5 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6312.5 ns 6583 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 241480.5 ns 239979.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 6000 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5917 ns 5917 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 5958 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25115 ns 24627 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21604.5 ns 20916.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21166 ns 21209 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21750 ns 21833 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21708.5 ns 21292 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 267689.5 ns 265615.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 186417 ns 192687.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144250 ns 146521 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 148916.5 ns 149374.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 187729 ns 142250 ns 1.32
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167935.5 ns 168462.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1375083.5 ns 1318667 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1321917 ns 1326875 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1326146 ns 1328208 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1322375 ns 1311167 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1358092 ns 1370856 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23000 ns 22125 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25770.5 ns 22083 ns 1.17
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24167 ns 24209 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23834 ns 24417 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 354989 ns 357178 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 130916 ns 130958 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 188500 ns 180395.5 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 127375 ns 130875 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 176959 ns 178917 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1479622.5 ns 1498842 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23532 ns 23528 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6458 ns 6417 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6333 ns 6791 ns 0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6833 ns 6834 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6458 ns 6792 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 258733.5 ns 258073.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4833 ns 4500 ns 1.07
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4917 ns 5250 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5709 ns 5125 ns 1.11
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4167 ns 4667 ns 0.89
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256891 ns 256140 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9916 ns 10000 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9958 ns 10416 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10584 ns 10292 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10167 ns 10208 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1360812 ns 1357774 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1667 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23180 ns 23069 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5708 ns 5750 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5667 ns 6084 ns 0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6000 ns 5917 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5708 ns 5667 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 276437 ns 275859 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6818791 ns 6814167 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6367083 ns 6368854.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6546291.5 ns 6497917 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7662166 ns 7560667 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215904 ns 215030 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24172500 ns 24038396 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21282334 ns 21318250 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21008479 ns 21055625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29757292 ns 29800458 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2111780 ns 2117334 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 48853770.5 ns 37406895.5 ns 1.31
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 34383187.5 ns 45481041 ns 0.76
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45683833.5 ns 45606750 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49363417 ns 49407375 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6479.5 ns 6375 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6125 ns 6208 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6833 ns 7292 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 5916 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 238562.5 ns 237163.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 8375 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8084 ns 8666 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8208 ns 8416 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7917 ns 8375 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1069949 ns 1062411 ns 1.01
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1541500 ns 1544167 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1273500 ns 1249833.5 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1639187 ns 1625709 ns 1.01
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2161000 ns 2004375 ns 1.08
lenet(28, 28, 1, 128)/forward/GPU/CUDA 276949 ns 275720 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7986167 ns 7903083 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6543375.5 ns 6659625 ns 0.98
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7167709 ns 7184500 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10462145.5 ns 10128083 ns 1.03
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1888924 ns 1884846.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 343084 ns 369396 ns 0.93
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 369208 ns 353625.5 ns 1.04
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 456437.5 ns 456542 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 26417 ns 24041.5 ns 1.10
batchedmm(128, Bsize=4)/forward/GPU/CUDA 42517 ns 46544 ns 0.91
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 749479 ns 743500 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 814979 ns 796417 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1061458 ns 1071583 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 119729.5 ns 125958 ns 0.95
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 307361.5 ns 312111.5 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 395625 ns 397375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288375 ns 212250 ns 1.36
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288167 ns 288125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 749875 ns 753500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44492 ns 44394 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 646208 ns 673292 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 533666 ns 472125 ns 1.13
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 529000 ns 531791 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974208 ns 974625 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 191704 ns 191967.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 670000 ns 657167 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 636958 ns 669958.5 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 641042 ns 661104 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 677625 ns 662708 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132879 ns 132971.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2560042 ns 2458250 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2486124.5 ns 2498250 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2459583 ns 2467687 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2464667 ns 2501875 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1294427.5 ns 1568577 ns 0.83
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 2459 ns 4333 ns 0.57
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3208 ns 2583 ns 1.24
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4500 ns 4417 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 3354 ns 2750 ns 1.22
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16581 ns 16411 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5541 ns 5375 ns 1.03
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5500 ns 5458 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5583 ns 5625 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5541 ns 5625 ns 0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 200795 ns 199892.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458667 ns 1463541 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1501750 ns 1497208 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1499417 ns 1503375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1438916 ns 1442834 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40877 ns 41596 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5154042 ns 5109479 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5302542 ns 5289042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5280125 ns 5301333.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4986917 ns 4680604 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198039.5 ns 198982.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3667 ns 3667 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3709 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33533 ns 33311 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14875 ns 15125 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15125 ns 15084 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15417 ns 15250 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15125 ns 15250 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 380809 ns 376159 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71583 ns 71208 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71458 ns 71209 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71166 ns 71250 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 70000 ns 71500 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112938 ns 112893 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 327333 ns 317750 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 333917 ns 323708 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 320375 ns 334166 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 318167 ns 320500 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 194303.5 ns 195635 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1041 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1083 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1125 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 959 ns 1083 ns 0.89
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24404 ns 23896 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8166 ns 8000 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8083 ns 8625 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8333 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8000 ns 8167 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 265429.5 ns 263562 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 503042 ns 509521 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 488125 ns 479125 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 565250 ns 564625 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 215521 ns 232458.5 ns 0.93
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129735 ns 129625 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1418125 ns 1393208 ns 1.02
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1470041 ns 1479000 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1769041.5 ns 1765792 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 863062.5 ns 868125 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 275150 ns 276144 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32275 ns 31637 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6375 ns 6334 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 6625 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6645.5 ns 6625 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6291.5 ns 6667 ns 0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 266163 ns 263537 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1767250 ns 1722958.5 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1723000 ns 1735250 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1726625 ns 1733292 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1769375 ns 1763312 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169706 ns 169598.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4423667 ns 4353521 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4340375 ns 4379875 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4364395.5 ns 4349063 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4356604.5 ns 4390959 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1259542.5 ns 1422688.5 ns 0.89
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6875 ns 6938 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6708 ns 6875 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 9167 ns 7166 ns 1.28
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 9667 ns 6583 ns 1.47
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 21299 ns 20547 ns 1.04
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 52542 ns 50541 ns 1.04
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 48458 ns 50312.5 ns 0.96
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 32834 ns 51250 ns 0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51708.5 ns 58249.5 ns 0.89
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 295364.5 ns 308428 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17959 ns 17750 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 18333 ns 17875 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18667 ns 19125 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17833.5 ns 17500 ns 1.02
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18767 ns 18339 ns 1.02
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53250 ns 53375 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53583 ns 53166 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53292 ns 53250 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53500 ns 53458 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 337341 ns 344770 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75666 ns 75459 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75250 ns 75375 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75208 ns 75395.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 74750 ns 75458 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46984 ns 47276 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 342791 ns 336417 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 339042 ns 341125 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 324833 ns 339250 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 325083 ns 336541 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 212927.5 ns 213552 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1484000 ns 1489000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1528916 ns 1522292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1527041 ns 1529458 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1464042 ns 1468458 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 52506 ns 52575 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5172417 ns 5115542 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5313667 ns 5292541 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5251417 ns 5289458.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4985750 ns 4978625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 206884 ns 206120 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28375 ns 28125 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28250 ns 28250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28209 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28208 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24921 ns 24358 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66458 ns 66334 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66625 ns 66167 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66291 ns 66209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66250 ns 66750 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 525792 ns 526089 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1326354 ns 1498042 ns 0.89
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1132104 ns 911000 ns 1.24
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1139166 ns 1149625 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2248604 ns 2098500 ns 1.07
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 583822.5 ns 582137 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3055395.5 ns 3080771 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2729979 ns 2593125 ns 1.05
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2738333 ns 2751125 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3816042 ns 3818125 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2120607 ns 2100592 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8049792 ns 7913063 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8097167 ns 8011208 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7911292 ns 7901167 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4824937 ns 4863125 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82042 ns 81500 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81875 ns 82000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82625 ns 84125 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82125 ns 83083 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194553 ns 194175 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2055125 ns 2020500 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2001916.5 ns 2036292 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2021458 ns 2018708 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2014750 ns 2021916 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 811167.5 ns 810603 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.