-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: use generic broadcasting for complex numbers (#1106)
- Loading branch information
Showing
4 changed files
with
38 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "LuxLib" | ||
uuid = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.3.9" | ||
version = "1.3.10" | ||
|
||
[deps] | ||
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
@testitem "complex differentiation: issue #977" tags=[:misc] begin | ||
using Lux, Zygote, Random | ||
|
||
rng = Random.default_rng() | ||
Random.seed!(rng, 666) | ||
|
||
rbf(x) = exp.(-(x .^ 2)) | ||
|
||
U = Lux.Chain( | ||
Lux.Dense(1, 10, rbf), | ||
Lux.Dense(10, 3, rbf) | ||
) | ||
|
||
θ, st = Lux.setup(rng, U) | ||
|
||
# Approximate the derivative of `f` at the real point `x` with the complex-step
# method: evaluate `f` at `x + ϵ * im` and divide the imaginary part of the
# result by the step `ϵ`.
#
# Arguments:
#   f - any callable that accepts a complex number (not restricted to
#       `Function`, so callable objects and types are accepted as well)
#   x - real evaluation point
#   ϵ - real, non-zero step size
#
# Returns `imag(f(x + ϵ * im)) / ϵ`.
function complex_step_differentiation(f, x::Real, ϵ::Real)
    return imag(f(x + ϵ * im)) / ϵ
end
|
||
# Scalar loss: the complex-step derivative (step 1e-5) of the first element of
# the value returned by `U([τ], θ, st)`, taken at `t` and reduced with `sum`.
# NOTE(review): `[begin]` presumably picks the model output out of an
# (output, state) tuple — confirm against the Lux call convention.
loss(t) = sum(complex_step_differentiation(τ -> U([τ], θ, st)[begin], t, 1e-5)) | ||
|
||
# With LuxLib ≥ 1.3.10 the Zygote gradient of `loss` at 1.0 is expected to be a
# Float64; on older versions the same check is recorded as a known failure.
if pkgversion(LuxLib) ≥ v"1.3.10" | ||
@test only(Zygote.gradient(loss, 1.0)) isa Float64 | ||
else | ||
@test_broken only(Zygote.gradient(loss, 1.0)) isa Float64 | ||
end | ||
end |
161b64c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register subdir=lib/LuxLib
161b64c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
161b64c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/120168
Tip: Release Notes
Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:
161b64c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/120169
Tip: Release Notes
Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:
161b64c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4042
ns3937.5
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4042
ns4333
ns0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5000
ns4917
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3917
ns4042
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
60335
ns61383
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10292
ns10583
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9958
ns10250
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10917
ns10125
ns1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9917
ns10083
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
425045
ns431239
ns0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1250
ns1042
ns1.20
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1125
ns1334
ns0.84
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1417
ns1333
ns1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1083
ns1125
ns0.96
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
17905
ns18191
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4083
ns4333
ns0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4000
ns4250
ns0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4375
ns4291
ns1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3916
ns3834
ns1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
109347
ns110865.5
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
56292
ns57709
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46833
ns46667
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46229.5
ns46208
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81458
ns80291
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
36705
ns37897
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2055229.5
ns2036958
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2092146
ns2083333.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2088791.5
ns1856125
ns1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2005459
ns1994375
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
195507
ns198201
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
175854
ns157792
ns1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
144666
ns145875
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
145708
ns145583.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
141167
ns143729
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
165651
ns166222
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1150750
ns1114145.5
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1127354.5
ns1128875
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1114250
ns1024292
ns1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1116458.5
ns1115833.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
529529
ns534915.5
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3208
ns3584
ns0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3417
ns4208
ns0.81
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4208
ns4000
ns1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3334
ns3583
ns0.93
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
70388
ns67978
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8875
ns9750
ns0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9500
ns10459
ns0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9750
ns8625
ns1.13
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9250
ns9125
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
494790
ns495677
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
15209
ns15000
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15000
ns18500
ns0.81
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
17209
ns16000
ns1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
14688
ns14583
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
54580
ns55105
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
216291.5
ns213834
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
214167
ns215958
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213416
ns213333
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
225708.5
ns214375
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
274273
ns276152.5
ns0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
709
ns541
ns1.31
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
625
ns792
ns0.79
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
834
ns792
ns1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
625
ns541
ns1.16
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17190
ns17241
ns1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1709
ns1541
ns1.11
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1375
ns1708
ns0.81
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1833
ns1791
ns1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1584
ns1542
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
102235
ns102070.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7125
ns7208
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5958
ns5958
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5916
ns5916
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9958
ns10042
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23722
ns23944
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
222833
ns221312.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
227958
ns229500
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229500
ns228667
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213417
ns218021
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
169452
ns170367
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4000
ns3958
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3916
ns3917
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3916
ns3875
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23542
ns23420
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16834
ns17000
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16709
ns17084
ns0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16959
ns16916
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16666
ns16708
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
162915
ns162884.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
571709
ns573416.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
574917
ns580333
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
573708
ns568042
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
568500
ns569542
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113185.5
ns113416
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1427354.5
ns1418250
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1431625
ns1429042
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1423541
ns1420375
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1422542
ns1437458
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
211963
ns212927.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1046896
ns1086895.5
ns0.96
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
967000
ns962854
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1344687.5
ns1344792
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1304958
ns1286083
ns1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA
275060
ns281106
ns0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
5993167
ns5908292
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4544458
ns4600625
ns0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4946959
ns4927041.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5568042
ns5714562.5
ns0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1091420
ns1101975
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
583
ns542
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
500
ns542
ns0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
541
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23913
ns23476
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2209
ns2209
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2125
ns2167
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2167
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2084
ns2166
ns0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
169337.5
ns169515.5
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4417
ns4333
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
3833
ns4500
ns0.85
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4625
ns4791.5
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3958.5
ns3791
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
65443
ns65149
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11833
ns11584
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11250
ns11333
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11958
ns11667
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11125
ns11333
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
450871
ns446339
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7208
ns6916
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6958
ns6917
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7417
ns7479.5
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6333
ns6042
ns1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
51992
ns51979.5
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
18459
ns19125
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17500
ns17458
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17833
ns17792
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17459
ns17333
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
300918
ns300938.5
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
667
ns666
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
542
ns584
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns542
ns1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
625
ns584
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
32212
ns32053.5
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9084
ns9250
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9437
ns9042
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9333
ns9375
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8958
ns8917
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
158990.5
ns158152
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64208
ns64333
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64833
ns64625
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64542
ns64458
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64542
ns64375
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111823
ns111585
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
282708
ns287209
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
279000
ns277834
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
273166
ns280583
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
281437.5
ns281125
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
186218.5
ns183928
ns1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3136750
ns3298562.5
ns0.95
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
3023208
ns3083000
ns0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3030188
ns3028771
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
3954583.5
ns4061625
ns0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
576992
ns577723.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7597041.5
ns7606291
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7419792
ns7495375.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7452395.5
ns7404541
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8186583.5
ns8192541.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1367306
ns1371476
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17658333
ns17505792
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
17553062.5
ns17567291
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
17551250
ns17475667
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
14310208
ns14122958.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23729167
ns23660020.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
33388291
ns34147791.5
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37228104.5
ns37059937.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34843354
ns34985187.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1868338
ns1854503
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
192271333
ns187449375
ns1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
232983250
ns233703458.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
191886562.5
ns195671083
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
435397084
ns433586291
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13905970
ns13830860.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
291433625
ns288446709
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
336814583
ns337867791
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
297436208
ns296978708
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
408923438
ns400413062.5
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
22583
ns22084
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24708
ns24979.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23209
ns23875
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21625
ns21666
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
99141.5
ns98077
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
103334
ns102958
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
103750
ns104792
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
105083
ns103812
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
103062.5
ns110292
ns0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
520213.5
ns512479
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6000
ns6125
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5958
ns6500
ns0.92
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6958
ns7062.5
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5708
ns6083
ns0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
69364
ns69253
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15042
ns15166
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15209
ns16145.5
ns0.94
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16250
ns16208
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15083
ns15083
ns1
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
484888
ns482969
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3057208.5
ns3041271
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2066208
ns2067458.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2260437.5
ns2297479.5
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4508458
ns4457375
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
589772
ns592674
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23926959
ns23527562.5
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18026875
ns18050396
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
18022708
ns17902834
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35506041.5
ns35496125
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2765084
ns2768935.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33917958
ns33385459
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27599646
ns27540666
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28534208
ns28658250
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41643583.5
ns41547354.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74541.5
ns74875
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
74313
ns74396
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
74500
ns75500
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
72291
ns74333
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
104269
ns102653
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
317750
ns291291.5
ns1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
208562.5
ns318417
ns0.65
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
322375
ns208187.5
ns1.55
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
291583.5
ns290437.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
562266.5
ns545207.5
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11875
ns11958
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
11625
ns12145.5
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13250
ns14209
ns0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12125
ns11500
ns1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
72944
ns70994
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27208
ns27042
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26791.5
ns26917
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27833.5
ns27625
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26750
ns26958
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
485353
ns469447.5
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
13458.5
ns12708
ns1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12375
ns12708
ns0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13250
ns14125
ns0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12291
ns11958
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
54559.5
ns52810
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26417
ns25958
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
25959
ns26209
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26209
ns25958
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26000
ns26875
ns0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
311166.5
ns301312
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
181458
ns179875
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
179708
ns181083
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
183437.5
ns182500
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
181354
ns179666
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
58673.5
ns56497
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
597521
ns582959
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
584083
ns588917
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
583958.5
ns585083
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
582625
ns590500
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
295518
ns286103
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6125
ns6417
ns0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5958
ns7125
ns0.84
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7333
ns8083
ns0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6166.5
ns5750
ns1.07
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
71636.5
ns70488.5
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15312.5
ns14875
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14333
ns14458
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15708
ns15417
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13958
ns14291
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
473061
ns457568
ns1.03
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1205708
ns1207438
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1241125
ns1241417
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1286479
ns1284208
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1000208
ns997354.5
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
301351
ns301394.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4319770.5
ns4107041.5
ns1.05
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4471334
ns4414458
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4578416
ns4959854.5
ns0.92
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
3698417
ns3696125
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1037486.5
ns1040815
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1916
ns1834
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1792
ns1834
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1917
ns1833
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
24166
ns23635
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4917
ns4959
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4834
ns5041
ns0.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5083
ns4958
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4958
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
194650
ns185922
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6583
ns6250
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6208
ns6625
ns0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7125
ns6334
ns1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5750
ns5666
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
56615.5
ns55102
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
12209
ns11084
ns1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10895.5
ns11834
ns0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11667
ns10791
ns1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
11000
ns10875
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
336343
ns330730
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
375
ns334
ns1.12
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
250
ns333
ns0.75
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
375
ns292
ns1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
23536
ns22810
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
3042
ns3000
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2791
ns3000
ns0.93
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3042
ns2959
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2750
ns2750
ns1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
163558.5
ns156803.5
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12083
ns11750
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11375
ns11958
ns0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14667
ns13292
ns1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11500
ns11292
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
58066.5
ns57953
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25541
ns25291.5
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24250
ns24917
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25125
ns25125
ns1
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24458
ns24542
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
299332
ns293802.5
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4208
ns4167
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4167
ns4209
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4209
ns4208
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4167
ns4208
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
25749
ns24619
ns1.05
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16125
ns16375
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16166
ns16292
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16291
ns16333
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16250
ns16250
ns1
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
200089.5
ns195053
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5916
ns5833
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5750
ns5834
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5959
ns5834
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5833
ns5875
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
34238
ns33320
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
21125
ns21209
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
20459
ns21125
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21167
ns21625
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20812.5
ns20667
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
179917
ns173685
ns1.04
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
397270.5
ns426708
ns0.93
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
384187.5
ns384958
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
478583.5
ns482062.5
ns0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
103333
ns102708.5
ns1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA
67557
ns66966
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
891750
ns909146
ns0.98
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
972959
ns972729
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1184041.5
ns1175729
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
330499.5
ns439917
ns0.75
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
194177
ns190337.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
79812.5
ns80625
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81209
ns81500
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
84042
ns81708
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
79916.5
ns80187.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194547.5
ns193436
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1931812.5
ns1902458
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1636646
ns1931125
ns0.85
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1918646
ns1927562.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1926062
ns1906917
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
403673
ns397725
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
22738
ns22050
ns1.03
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1833
ns1834
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1792
ns1834
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1833
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
173787
ns169534
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7041
ns6875
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6666
ns7146
ns0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7666
ns7667
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6666
ns6500
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
61338.5
ns62608.5
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9459
ns9542
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9208
ns9333
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9333
ns9083
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9500
ns9458
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
310208.5
ns313766.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
155906937.5
ns118190208
ns1.32
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174332958
ns174175750
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147872625
ns147818500
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
105277000
ns107522750
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5483548
ns5476530
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
669282000
ns612187917
ns1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
555382333
ns556303083
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
453291791.5
ns452274750
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
761771979
ns757288396
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
35124637
ns38234410
ns0.92
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
699486584
ns649126292
ns1.08
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
668241854.5
ns667267229
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
612942458.5
ns589618437.5
ns1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
744149959
ns741758417
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
56292
ns57583
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47709
ns47375
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47584
ns47833
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83167
ns82250
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37949
ns37784
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1925646.5
ns1917978.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1981729
ns1995291.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1969458.5
ns1985646
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1898709
ns1843354
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
177394.5
ns172983
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
269708.5
ns266084
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
268625
ns268750
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
287000
ns268209
ns1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267041
ns267562
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
125253
ns132212
ns0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
681916.5
ns650416.5
ns1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
693791
ns674667
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
682500
ns589437.5
ns1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
685958
ns688771
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
675851.5
ns730804
ns0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2214458
ns2181417
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2234583
ns2196416.5
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2206291
ns2101104
ns1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2191792
ns2231125
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
134149
ns133510.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5560291
ns5502917
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5498375
ns5510333
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5509646
ns5498521
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5459417
ns5441417
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
719852
ns776428
ns0.93
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
658625
ns640333
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
643125
ns646083
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
639042
ns646875
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
637666
ns635334
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
47328
ns47144
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1793125
ns1818833
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1725000
ns1727958
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1723687.5
ns1724083
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2098895.5
ns2099750
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
225375
ns220116
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
56875
ns58500
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47416
ns47083
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47125
ns46458
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83625
ns81500
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
29103
ns28922
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036833
ns2025729
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2094667
ns2106791.5
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2075625
ns2095000
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2003333
ns1998375
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
192100
ns189080
ns1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13402000
ns13351000
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12431542
ns12437395.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12506125
ns12498666.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
14837542
ns14894375
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
516101
ns519065
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47711000
ns47200625
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
42011395.5
ns41881708
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
40917708
ns40754334
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58129729.5
ns58105083
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2890593.5
ns2883161
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
97106625
ns96219125
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
68523125
ns91954062.5
ns0.75
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90562125
ns90758584
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
76819625
ns98984500
ns0.78
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57208
ns58708
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47750
ns47000
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47250
ns47291
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82041
ns82000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
47330
ns47821
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1935667
ns1902250
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1983791
ns1986000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1973041.5
ns1978125
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1878417
ns1883042
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
195219
ns194258.5
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
417
ns417
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
292
ns375
ns0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns333
ns1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
334
ns333
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
32542
ns32804.5
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6750
ns6792
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6125
ns6625
ns0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6625
ns6708
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6375
ns6166
ns1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
170591.5
ns179592
ns0.95
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
334
ns292
ns1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32528
ns32076
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2958
ns2917
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2667
ns2875
ns0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2917
ns2875
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2625
ns2625
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
158771.5
ns166744
ns0.95
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
321043354.5
ns284237292
ns1.13
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
340532834
ns339653916.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
314151312.5
ns313913791.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
270601541
ns272402875
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7107105.5
ns7047786.5
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1046677708.5
ns993594875
ns1.05
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
945289167
ns945283292
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
840954313
ns835507124.5
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1155312792
ns1160037292
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34104665
ns34045459
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1718615541
ns1668906166
ns1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1335253333.5
ns1694695167
ns0.79
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1620256500
ns1627000917
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1333409458.5
ns1703328625
ns0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1460479.5
ns1418646
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1422584
ns1413958
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1418083.5
ns1414875
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1412208.5
ns1411416
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
127814.5
ns128242
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5051916
ns5029312.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5033458.5
ns5037875
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5025999.5
ns5028146
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5025125
ns5024417
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
500081
ns552451.5
ns0.91
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
162840083
ns170453833
ns0.96
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
128019708.5
ns127944542
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
130269666
ns129428958
ns1.01
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
152687687.5
ns164372666.5
ns0.93
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4884899
ns4859943
ns1.01
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
844540708
ns620949625
ns1.36
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
537349833
ns515114583
ns1.04
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
560583292
ns463124083
ns1.21
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
649437458
ns648066667
ns1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
17863022
ns16797902
ns1.06
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
9095833.5
ns8927250
ns1.02
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8979250
ns8950584
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7868500
ns7917333
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
9713958
ns9753125
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1593097
ns1591258
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
37599479
ns35919479
ns1.05
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
37114520.5
ns37210542
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33537625
ns33517916.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
37598895.5
ns37573417
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6454775
ns6470424
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47417
ns47417
ns1
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47500
ns47583
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47666
ns47708
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47375
ns47417
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
18487
ns18601
ns0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50417
ns50542
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50416
ns50458
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50500
ns50542
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50333.5
ns52916.5
ns0.95
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
161534
ns206886.5
ns0.78
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7854.5
ns7000
ns1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6770.5
ns7291
ns0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7729.5
ns7625
ns1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7083
ns7125
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
73765
ns89400.5
ns0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10209
ns10625
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10375
ns10333.5
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10042
ns10417
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10000
ns10479.5
ns0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
437389
ns543240.5
ns0.81
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6875
ns6000
ns1.15
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6458
ns6166
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8250
ns7083
ns1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5583.5
ns5666.5
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
81756
ns121379.5
ns0.67
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13791
ns13333
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13500
ns13000
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13541
ns13333
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12895.5
ns12687.5
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
408231
ns510219
ns0.80
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1125
ns1125
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1000
ns1084
ns0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1084
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
32689
ns32431.5
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8500
ns8375
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7667
ns8542
ns0.90
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8000
ns8292
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8042
ns8042
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
192936.5
ns204899
ns0.94
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23459
ns23208
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23208
ns23417
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23375
ns23666
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23542
ns23334
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18259
ns18285
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52750
ns52625
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52625
ns52709
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
52791.5
ns53083
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52292
ns52562.5
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
228166
ns286000
ns0.80
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1407187.5
ns1398458
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1444833
ns1450667
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1405083
ns1398999.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1396895.5
ns1395750.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
196465
ns196905
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5040708
ns5011896
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5018541
ns5032187.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5002417
ns5012250
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5013625
ns5002687.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
546168
ns598226
ns0.91
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3079083
ns3070875
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2047000
ns2072042
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2294458.5
ns2289104.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4540917
ns4773854
ns0.95
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
582581
ns584355
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24731020.5
ns24311583
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18912562.5
ns18870583.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19038249.5
ns19070166.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36828979.5
ns36514562.5
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2836262.5
ns2861612.5
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
34546958.5
ns34008958
ns1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28342834
ns28397792
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28021500.5
ns27946625
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41446459
ns41793708.5
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
144151542
ns144075292
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
148019541
ns147842750
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
125949729
ns126624187.5
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
173005021
ns172290146
ns1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22565027
ns22560426
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
948587416.5
ns1298569062.5
ns0.73
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1316893208.5
ns886633209
ns1.49
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
846166625
ns1199135125
ns0.71
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
681952500
ns689233333
ns0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
118678990
ns117701235
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76499.5
ns73000
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
80646
ns73292
ns1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
75541.5
ns85645.5
ns0.88
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
72583
ns72583
ns1
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
219501.5
ns223969
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
295875
ns276062.5
ns1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
203584
ns287625
ns0.71
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
292875
ns282625
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
288125
ns190583
ns1.51
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1030687
ns1155754
ns0.89
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
36242145.5
ns35424583
ns1.02
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
36566979.5
ns36355854
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32367458.5
ns32516083.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40164416.5
ns40329917
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5846818
ns5847057
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
152632458
ns144746000
ns1.05
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
152676896
ns153804708.5
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
139286062.5
ns140298187
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
283773000
ns283107125
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34916870
ns34865240
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
156722375.5
ns121095354
ns1.29
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
173916792
ns174763417
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148066500
ns148056208
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
102175416
ns105211667
ns0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5486669
ns5466322
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
519305021
ns468110062.5
ns1.11
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
467283583
ns466487917
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
441689083
ns437682625
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
742430042
ns737562458
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
32276395
ns35152775
ns0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
688401084
ns706128833.5
ns0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
657912104.5
ns656179312
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
573100917
ns571296688
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
731550292
ns731578125
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1195458.5
ns1324833
ns0.90
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
988250
ns963417
ns1.03
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
987583
ns979125
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2066875
ns2064125
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
585359
ns573443.5
ns1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2919770.5
ns2963875
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2614875
ns2641084
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2611792
ns2621249.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3691417
ns3522250
ns1.05
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1640515
ns1659147
ns0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
5907500
ns5792625
ns1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
5785541
ns5824583.5
ns0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
5799666
ns5815083.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
2887792
ns2879416
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7167
ns7292
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6125
ns6333
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6167
ns6250
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10000
ns9917
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25666.5
ns25248
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213834
ns212708
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
221083
ns220666
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220958
ns221208
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
209500
ns206375
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
224627
ns250623
ns0.90
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
310292438
ns307616584
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
228430666
ns221441583
ns1.03
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
199615625
ns198752396
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
310121208
ns309471333
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7680035.5
ns7903869
ns0.97
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1101205687.5
ns1075422250
ns1.02
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
904614354
ns906727646
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
806439375
ns801892167
ns1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1160007708.5
ns1153514499.5
ns1.01
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26999631
ns26746953
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6458
ns5791.5
ns1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6041
ns5917
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6083
ns6375
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4895.5
ns4875
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
119636.5
ns155781
ns0.77
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7833
ns7625
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7292
ns7334
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7500
ns7562.5
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7166
ns7083
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
510149.5
ns649264
ns0.79
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
666
ns542
ns1.23
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
708
ns666
ns1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
583
ns542
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24235
ns23898
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9625
ns9333.5
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9583
ns9542
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9291
ns9833
ns0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9041
ns8792
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
191615
ns220286
ns0.87
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
352542
ns351479.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
351541.5
ns352042
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
354208
ns353812.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
352041
ns354687.5
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21082
ns21024
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
827625
ns811959
ns1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
774417
ns778625
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
830187
ns774625
ns1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
822209
ns821708
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
224458.5
ns304830.5
ns0.74
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
315667
ns339000
ns0.93
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
337708
ns343083
ns0.98
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
448542
ns451041.5
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
11375
ns10583
ns1.07
batchedmm(16, Bsize=32)/forward/GPU/CUDA
18423
ns18316
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
705604.5
ns714000
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
738958.5
ns742750.5
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
999000
ns1003583
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
26459
ns26375
ns1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
211965.5
ns291054.5
ns0.73
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
360167
ns384958.5
ns0.94
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
346666
ns348083
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
437417
ns444917
ns0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
30125
ns30125
ns1
batchedmm(16, Bsize=128)/forward/GPU/CUDA
22977
ns23128
ns0.99
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
727167
ns738542
ns0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
782250
ns791896
ns0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1026667
ns1018521
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
90000
ns105270.5
ns0.85
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
196309.5
ns225989
ns0.87
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3625
ns3708
ns0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3541
ns3708
ns0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3625
ns3792
ns0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3458
ns3542
ns0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
18016
ns17710.5
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4541
ns4500
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4375
ns4250
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4292
ns4500
ns0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4250
ns4417
ns0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
210663.5
ns279830
ns0.75
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3750
ns3666
ns1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3625
ns4125
ns0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4500
ns4500
ns1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3708
ns3708
ns1
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
158953
ns199332
ns0.80
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8750
ns8875
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8167
ns8500
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8875
ns8458
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8375
ns8500
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
976072
ns1228522.5
ns0.79
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
203542
ns203645.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
211375
ns210208
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
212125
ns210792
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
200042
ns201125
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35273
ns34981
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
649750
ns611520.5
ns1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
622083
ns624479.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
673000
ns624000.5
ns1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
628584
ns630624.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
286304.5
ns361212
ns0.79
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
1006541.5
ns995583
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
1012562.5
ns1022646
ns0.99
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
950084
ns952562
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
867374.5
ns869209
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
208692
ns207395.5
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4662333
ns4529458
ns1.03
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4724042
ns4744750
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4460291
ns4448625
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
5133479.5
ns5089542
ns1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
931046
ns933469
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3750
ns3666
ns1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3416
ns3375
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
3875
ns4167
ns0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3000
ns3209
ns0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
160179
ns242210.5
ns0.66
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7708
ns7792
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7000
ns7167
ns0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7500
ns7417
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6917
ns7208
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
834512
ns1046390.5
ns0.80
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1638021
ns1637500.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1178750.5
ns1186917
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1368583
ns1336062.5
ns1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2435458
ns2468375
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
212757
ns213930
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12417125
ns12339333
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9573771
ns9615979.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9272896
ns9254104
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18032250
ns17996208
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1947684.5
ns1954541
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17407875.5
ns17361479
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14413792
ns14427833
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14355521
ns14271583.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21131291.5
ns21144917
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
89791
ns88834
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
90333
ns91500
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
91667
ns90834
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
88604
ns87625
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125843
ns125982
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2042500
ns2019500
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2024209
ns2042708
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2017334
ns2028042
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2030458
ns2025999.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
851622
ns1063927.5
ns0.80
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
1500
ns3541.5
ns0.42
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
2250
ns2333
ns0.96
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
3833
ns3584
ns1.07
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
2250
ns1500
ns1.50
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15376
ns15780.5
ns0.97
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2916
ns3000
ns0.97
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2459
ns2958
ns0.83
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
2791
ns2709
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2917
ns2875
ns1.01
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
153882
ns195545.5
ns0.79
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7208
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns6042
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6000
ns6000
ns1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10000
ns10000
ns1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33856.5
ns33801
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221791
ns221667
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220646
ns228625
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220479.5
ns221000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
241958.5
ns206709
ns1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
266253.5
ns347206.5
ns0.77
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3708
ns3708
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3750
ns3709
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3708
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3709
ns3750
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22475
ns22295
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14209
ns14500
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14333
ns14459
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14459
ns14458
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14417
ns14417
ns1
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
372905
ns485845.5
ns0.77
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
96208
ns92250
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
95604
ns94209
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
96583.5
ns95250
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
91812.5
ns91750
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125359
ns125421
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1942458
ns1929000
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1923146
ns1929917
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1909167
ns1922333
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1932625
ns1923646
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
780596.5
ns960449
ns0.81
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
859584
ns875291
ns0.98
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
815917
ns817687.5
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1209375
ns1220791.5
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
960270.5
ns956708
ns1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA
271785
ns270219.5
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2844229
ns2786125
ns1.02
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2490542
ns2476771
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3348000.5
ns3326500
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3404749.5
ns3277354
ns1.04
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1487247
ns1614761
ns0.92
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17416.5
ns16229
ns1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17416
ns18000
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18333
ns17084
ns1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17604.5
ns14895.5
ns1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
140524.5
ns142802.5
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
261583
ns222458
ns1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215667
ns216625
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
257416.5
ns216270.5
ns1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215792
ns225667
ns0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
572039
ns642849
ns0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
222625
ns220666.5
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
222062.5
ns223437.5
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
222146
ns221291.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
220833
ns220208
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
232890
ns270694.5
ns0.86
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
507750
ns498104.5
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
501667
ns505958
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
556500
ns498020.5
ns1.12
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
507875
ns500229
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1207816
ns1376499
ns0.88
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
4292
ns4000
ns1.07
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
4042
ns3667
ns1.10
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
4541.5
ns5875
ns0.77
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
3875
ns3666
ns1.06
batchedmm(16, Bsize=4)/forward/GPU/CUDA
16753
ns16958
ns0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
7625
ns7333
ns1.04
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
7125
ns7333
ns0.97
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
7167
ns7125
ns1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
7270.5
ns7542
ns0.96
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
176857.5
ns195319
ns0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18542
ns17333
ns1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
16958
ns20291.5
ns0.84
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19792
ns19354.5
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16562.5
ns16708.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
145193.5
ns146982.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
224708
ns214625
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
211854
ns212500
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
238145.5
ns212792
ns1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212042
ns221417
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
888620
ns1020818
ns0.87
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4917
ns4125
ns1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4208
ns4500
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5042
ns5291
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3667
ns3542
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
184696.5
ns241162.5
ns0.77
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10875
ns11000
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10584
ns10459
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11042
ns10916
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10458
ns10125
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
966049
ns1056501
ns0.91
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3645.5
ns3333
ns1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3209
ns3875
ns0.83
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
3792
ns4250
ns0.89
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
2833
ns2938
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
188943.5
ns237567.5
ns0.80
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8000
ns7875
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7125
ns7709
ns0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7958
ns7250
ns1.10
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7208
ns7458.5
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1007792.5
ns1070019
ns0.94
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
24183291.5
ns23347771
ns1.04
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34946479
ns35406500
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37338083
ns37669583
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34888125
ns34858666
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1782868.5
ns1830001
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
186454375
ns183823166
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
159896583
ns159867750
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
145990104.5
ns146428479.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
411376042
ns410553708
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16457564
ns16506890.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
432652834
ns424862333.5
ns1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
247809833
ns253527416.5
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
279749334
ns295623854.5
ns0.95
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
479974375
ns480544667
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
183958.5
ns182875
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
182375
ns185563
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
185500
ns184500
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
184062.5
ns182250
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
172057.5
ns218471
ns0.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
637709
ns633375
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
586041.5
ns596208
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
639084
ns587250
ns1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
596416
ns590520.5
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1002959
ns1067870
ns0.94
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
4026750
ns3926937.5
ns1.03
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3920250
ns3941459
ns0.99
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3579209
ns3667000
ns0.98
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
4570291.5
ns4544333.5
ns1.01
batchedmm(128, Bsize=512)/forward/GPU/CUDA
532647
ns531767
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17895041.5
ns17389166
ns1.03
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17836083
ns17947521
ns0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16489292
ns16390812
ns1.01
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
20147270.5
ns19902458.5
ns1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2607011.5
ns2636393
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
583
ns584
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32522
ns32468
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9729
ns9479.5
ns1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9291
ns9500
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9625
ns9584
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9209
ns8875
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
258262.5
ns263813
ns0.98
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
503041917
ns498564458
ns1.01
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
424847437.5
ns426956020.5
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
425274250
ns423367333
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
682175395.5
ns596263958
ns1.14
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12478951
ns12482792
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1889075833
ns1875323562.5
ns1.01
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1625727875
ns1628477375
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1494457604.5
ns1492393083.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2214128083.5
ns2205444916.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49385566.5
ns49302271
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1647625
ns1639125
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1201312.5
ns1202125
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1376271
ns1357187.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2354042
ns2457312
ns0.96
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214603
ns213583.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12810437.5
ns12714125
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9968417
ns9952750
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9702395.5
ns9614459
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18320249.5
ns18361979
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2015837.5
ns2064490
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17772083
ns17715625
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14741771
ns14737021
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14583292
ns14521854
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21392208
ns21413792
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26292
ns26292
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26208
ns26666
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26167
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23824
ns24074
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67125
ns67542
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
67500
ns67625
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
67958
ns68125
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
67000
ns67042
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
377030.5
ns400556.5
ns0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204125
ns203812.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209792
ns210750
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
210750
ns209833
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
200292
ns199375
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26462
ns27041
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
650250
ns627333
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
625708.5
ns626584
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
669874.5
ns622042
ns1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
629250
ns580541
ns1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
303651
ns355125.5
ns0.86
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
627292
ns640833
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
671583
ns653000
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
598312
ns599854
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
639791
ns599062.5
ns1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132031
ns132599.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2336625
ns2247625
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2255375
ns2173250
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2235562.5
ns2242375
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2236583
ns2238250
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1129126
ns1242951
ns0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18437
ns17459
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18354.5
ns19917
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
20062.5
ns18958
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17104.5
ns17500
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
144037
ns146290
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
265000
ns227125
ns1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
230729
ns229687.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
231875
ns219959
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
258875
ns218709
ns1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
929000
ns1041939
ns0.89
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
708
ns625
ns1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
666
ns666
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
625
ns542
ns1.15
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23483
ns24055
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
10125
ns10042
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9541
ns9875
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10208
ns10292
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9291
ns9333
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
253535
ns260885.5
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5958
ns6208
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5417
ns6042
ns0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6166
ns6416
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4959
ns5062.5
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
177041
ns223280.5
ns0.79
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7770.5
ns7625
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7250
ns7416
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7875
ns7459
ns1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6917
ns6875
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
724899.5
ns794061
ns0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2334
ns2083
ns1.12
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2042
ns2333
ns0.88
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2208
ns2542
ns0.87
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2292
ns2166
ns1.06
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17786
ns17949
ns0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6667
ns6584
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6958
ns6584
ns1.06
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6625
ns6792
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6500
ns6520.5
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
316064.5
ns335163
ns0.94
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
752459
ns746958.5
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
746750
ns746875
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
750791
ns749792
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
746917
ns751729
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
21186
ns21434
ns0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
794041.5
ns819625
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
787583
ns791708
ns0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
810166.5
ns773145.5
ns1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
777749.5
ns790854
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
292715.5
ns298785
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7125
ns7375
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns6000
ns1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6042
ns5958
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns10166
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33031.5
ns33922
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
260583
ns220854.5
ns1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
266771
ns236854.5
ns1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
240125
ns228083.5
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213791
ns212875
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
347920
ns365652.5
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10417
ns10209
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10083
ns10417
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10666
ns10708
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9770.5
ns9541.5
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
236152.5
ns251155
ns0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24958
ns24333
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24125
ns24584
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25625
ns24583
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24625
ns24979
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1075060
ns1135827
ns0.95
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106687708
ns106024583.5
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
118577083.5
ns117903521
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120497312.5
ns120396396
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
118064771
ns117544479
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2612121
ns2631384.5
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
394040917
ns385240458
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
367160584
ns368294084
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
357048666
ns356727875
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
483172291
ns482802291
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15226002.5
ns15255065.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
944093812.5
ns936146916.5
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
581088583
ns762770042
ns0.76
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
744439291.5
ns746849979.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
770449312.5
ns945639875
ns0.81
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7833.5
ns7541
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6584
ns7250
ns0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7667
ns7583.5
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6584
ns6479
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
231298
ns243530.5
ns0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14833.5
ns14458
ns1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13833
ns13959
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14333
ns14750
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13667
ns13833
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1030746
ns1088103
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6812.5
ns6417
ns1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6250
ns6292
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8625
ns7166.5
ns1.20
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5458
ns5750
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
228035.5
ns238108.5
ns0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13541
ns13084
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12250
ns12916
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13417
ns12583
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12375
ns12166
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
749909.5
ns798707
ns0.94
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
5562.5
ns5584
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
5917
ns5770.5
ns1.03
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
5959
ns6500
ns0.92
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
5625
ns7166.5
ns0.78
batchedmm(2, Bsize=128)/forward/GPU/CUDA
17374
ns17513
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
15979.5
ns15667
ns1.02
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
15291
ns15625
ns0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
15666.5
ns15541
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
15791
ns15583
ns1.01
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
198865.5
ns202130
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
416
ns416
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
417
ns333
ns1.25
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns417
ns0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
334
ns292
ns1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23594
ns23880
ns0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6770.5
ns6584
ns1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6416
ns6458
ns0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6875
ns6583
ns1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6416
ns6187.5
ns1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
238325.5
ns241952.5
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5958
ns5917
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5917
ns5917
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5958
ns5959
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5875
ns5834
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
24848
ns25052
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
22291.5
ns21667
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21375
ns21750
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21750
ns21875
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
21833
ns21167
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
262151
ns267258
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
145041
ns144125
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
179792
ns144791
ns1.24
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
147000
ns146500
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
145833
ns143125
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
167939
ns168261.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1367292
ns1324313
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1334375
ns1331958
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1330499.5
ns1325708
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1319209
ns1319666.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1299116
ns1358754
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
23000
ns24416.5
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24062.5
ns24834
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
24875
ns23375
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21542
ns21292
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
285873
ns357948
ns0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
181458
ns132395.5
ns1.37
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
142020.5
ns127354.5
ns1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
130312
ns118459
ns1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
166291
ns117395.5
ns1.42
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1432985
ns1501059
ns0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
333
ns292
ns1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
24013
ns23530
ns1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6667
ns6875
ns0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6292
ns6666
ns0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6708
ns7125
ns0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6208
ns6417
ns0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
257668.5
ns259579
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4875
ns4625
ns1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4541
ns4708
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4917
ns5042
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4334
ns4084
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
248170.5
ns258332.5
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10541
ns10375
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9500
ns10209
ns0.93
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10583
ns10209
ns1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10000
ns10166.5
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1315251.5
ns1363356
ns0.96
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1667
ns1625
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1584
ns1625
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1625
ns1583
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23770.5
ns23506
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6083
ns5958
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5625
ns6042
ns0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6000
ns6125
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5625
ns5667
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
277301
ns277914
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6853687
ns6791458.5
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6416292
ns6360916.5
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6504750
ns6541917
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7620312.5
ns7577625
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214867.5
ns214916.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24153125
ns24027042
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21320542
ns21266917
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21047708.5
ns21002500
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29760542
ns29759417
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2095640.5
ns2132435.5
ns0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
48863062.5
ns48562041
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
34327709
ns45901125
ns0.75
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45697437.5
ns45588125
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
38239917
ns49263125
ns0.78
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6708
ns6000
ns1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5666
ns6167
ns0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6459
ns6625
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5770.5
ns5334
ns1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
232386
ns236967.5
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9062.5
ns9250
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8375
ns8458
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8375
ns8958
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8291
ns8250
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1027676.5
ns1058397
ns0.97
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1539229.5
ns1553000
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1264500
ns1271333
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1616916
ns1611667
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2152625
ns2139521
ns1.01
lenet(28, 28, 1, 128)/forward/GPU/CUDA
281859
ns272139
ns1.04
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7990000
ns7938708.5
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6612375
ns6600938
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7167458
ns7126750
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10472916.5
ns10443521
ns1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1870517
ns1846977
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
359666
ns374500.5
ns0.96
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
372896
ns372770.5
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
456458
ns456750
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
22396
ns23000
ns0.97
batchedmm(128, Bsize=4)/forward/GPU/CUDA
47625
ns46393
ns1.03
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
739666
ns736917
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
822937.5
ns808083
ns1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1053333
ns1057958
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
109291
ns78020.5
ns1.40
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
240230
ns308525
ns0.78
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
396792
ns397417
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288042
ns287917
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
287917
ns288000
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
755250
ns753542
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
45350
ns43767
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
639083
ns673583
ns0.95
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
531000
ns536166
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
531625
ns531917
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
973083
ns973208
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
194303
ns188160
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
636645.5
ns633500
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
636021
ns647250
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
652063
ns599709
ns1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
654042
ns615666
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
133147
ns131655.5
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2499458
ns2457916
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2456708
ns2396750
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2459542
ns2458187.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2452854
ns2452625
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1214588
ns1345493
ns0.90
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
2209
ns3083
ns0.72
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
3041
ns2833
ns1.07
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
4667
ns4500
ns1.04
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
2792
ns2583
ns1.08
batchedmm(2, Bsize=32)/forward/GPU/CUDA
16731
ns16191
ns1.03
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
5625
ns5750
ns0.98
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
5333
ns5584
ns0.96
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5625
ns5459
ns1.03
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
5584
ns5625
ns0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
199833.5
ns198160.5
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1461916.5
ns1458292
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1505708
ns1499833
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1503458
ns1499083
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1437083
ns1437417
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
41276
ns40922
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5154479
ns5128625
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5307146
ns5308187.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5288209
ns5301146
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5001917
ns4993250
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
200453
ns195601.5
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3750
ns3709
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3708
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3708
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3667
ns3708
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
34571
ns33852
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15250
ns15500
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15250
ns15334
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15375
ns15334
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15125
ns15166
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
372573
ns381247
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71375
ns71292
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71583
ns71416
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71208
ns71167
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
71250
ns70916
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
114012
ns113823.5
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
325917
ns317500
ns1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
325167
ns321000
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
318375
ns319083
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
317750
ns318500
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
199225
ns197369
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1083
ns1125
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1000
ns1084
ns0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1084
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
24050
ns24373
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8584
ns8333.5
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8084
ns8229.5
ns0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8375
ns8250
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8000
ns7833
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
262017.5
ns265338.5
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
497584
ns511459
ns0.97
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
490208.5
ns488042
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
559959
ns567084
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
148209
ns220750
ns0.67
batchedmm(128, Bsize=32)/forward/GPU/CUDA
129838.5
ns129208
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1405375
ns1389625
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1471875
ns1480250
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1758791.5
ns1756312.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
869583
ns865000
ns1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
274551
ns277406
ns0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
417
ns416
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
334
ns375
ns0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns292
ns1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
32490
ns32170
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns6875
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6208
ns6625
ns0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6541
ns6458
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6333
ns6020.5
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
265808
ns266374
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1723271
ns1718417
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1751146
ns1721417
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1734270.5
ns1726125
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1724083
ns1719500
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
169537.5
ns169010.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4419270.5
ns4367625
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4365292
ns4399270.5
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4351792
ns4374042
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4357792
ns4359438
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1171701
ns1258694
ns0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6833.5
ns6500
ns1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
7062.5
ns6625
ns1.07
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7833
ns7208.5
ns1.09
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6833
ns6542
ns1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
20938
ns20518
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
72249.5
ns32542
ns2.22
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
51291.5
ns52479.5
ns0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
52875
ns52000
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
51333
ns32625
ns1.57
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
211685.5
ns210236
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
17709
ns17542
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
18333
ns17917
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
18312.5
ns18708
ns0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
17625
ns17375
ns1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18852
ns18845.5
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
53583
ns53750
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
52958
ns53208
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
53625
ns53250
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
53417
ns53500
ns1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
337333.5
ns344404.5
ns0.98
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75417
ns75292
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75375
ns75500
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75334
ns74833
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75333
ns74959
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
47609
ns47057
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
339875
ns323708
ns1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
332958
ns338541
ns0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
325791
ns326000
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
324042
ns325417
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
215842
ns211393
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1486125
ns1486000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1530958
ns1527542
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1527584
ns1526208
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1463416
ns1463000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
52815
ns52398
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5149209
ns5120500
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5312291.5
ns5242958
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5298250
ns5297166.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4995000
ns4985916.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
207728
ns204362
ns1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28250
ns28250
ns1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28208
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28291
ns28167
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28292
ns28167
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24971.5
ns25076
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66292
ns66417
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66375
ns66417
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66209
ns66792
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66500
ns66458
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
510271
ns540264.5
ns0.94
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1349333.5
ns1467604.5
ns0.92
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1135833
ns1148208
ns0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1132458
ns1073125
ns1.06
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2196062.5
ns2179542
ns1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
589889
ns575331.5
ns1.03
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3042333
ns3075042
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2731792
ns2748167
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2726167
ns2727604
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3811625
ns3816646
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
2004374
ns2066149
ns0.97
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
8038292
ns7917125
ns1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
7942499.5
ns7956750
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
7931979.5
ns7912958
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
4817250
ns4824417
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80499.5
ns81334
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
82250
ns82000
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
82500
ns81895.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80479.5
ns80250
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194209
ns193566.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2050042
ns2017375
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2034333.5
ns2065916.5
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2017875
ns2015625
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2018854
ns2021542
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
768336
ns803967
ns0.96
This comment was automatically generated by workflow using github-action-benchmark.