Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: don't declare implicitly exported functions public (#1147)
* don't export deprecated functions: `@deprecate` by default exports the passed functions, which I assume was not intended here. This actually causes precompilation errors on Julia 1.12, since these functions are also declared public.
* remove public declaration instead
* Update src/helpers/recursive_ops.jl
- Loading branch information
ac2879b
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3625
ns3833
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4541
ns4250
ns1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5125
ns4666
ns1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3791
ns4041.5
ns0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
61743
nslayernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10125
ns10459
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10875
ns10417
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10334
ns10083
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10417
ns10625
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
430910
nsbias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1209
ns1125
ns1.07
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1209
ns1375
ns0.88
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1500
ns1375
ns1.09
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1042
ns1208
ns0.86
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
18223.5
nsbias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4000
ns3958
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4042
ns4125
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4334
ns4208
ns1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3875
ns3958
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
110886
nsbatchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
56709
ns57917
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38334
ns46459
ns0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46917
ns46750
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81750
ns82708
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37932
nsbatchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2043708.5
ns2047958
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2096520.5
ns2090000
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2096437.5
ns2093917
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1991167
ns1976812.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
197294.5
nslayernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
144625
ns146708
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
145667
ns182667
ns0.80
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
144916
ns145833
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
144854.5
ns143583
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166157.5
nslayernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1116791
ns1151625.5
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1150459
ns1117646
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1128083
ns1124084
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1121458
ns1165146
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
535998
nslayernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3417
ns3500
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4042
ns4083
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4459
ns4042
ns1.10
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3187.5
ns3916
ns0.81
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
72464.5
nslayernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9417
ns9083
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9458
ns9166
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9750
ns9125
ns1.07
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8708
ns8854.5
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
469472
nsgroupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
14375
ns17334
ns0.83
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
16208
ns18542
ns0.87
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18750
ns17834
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16875
ns16333
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
54038
nsgroupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213375
ns214916.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220000
ns214541
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
217250
ns213500
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213916
ns220667
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
270771
nsbias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
541
ns542
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns625
ns0.87
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
708
ns583
ns1.21
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
667
ns625
ns1.07
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17308
nsbias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1417
ns1458
ns0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1375
ns1750
ns0.79
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1541
ns1458
ns1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1417
ns1625
ns0.87
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
101606.5
nsbatchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7083
ns6208
ns1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5250
ns5958
ns0.88
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5958
ns6000
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10084
ns10208
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23383
nsbatchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221709
ns221042
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229750
ns228959
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229125
ns229375
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214125
ns223854.5
ns0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
167775.5
nsdense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3917
ns3875
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4000
ns3958
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23070
nsdense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
17083
ns16708
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16625
ns17083
ns0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
17083
ns16875
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16833
ns16584
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
162035
nsdense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
575083
ns570250
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
571792
ns577041
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
570750
ns576958
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
577208
ns573916
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113295
nsdense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1418250
ns1424354
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1422875
ns1421125
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1422500
ns1417666
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1425750
ns1422417
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
211866.5
nslenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1081041.5
ns1082874.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
946916.5
ns969583.5
ns0.98
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1353229.5
ns1345833
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1292458
ns1275270.5
ns1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA
269913.5
nslenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6001958
ns5772500
ns1.04
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4632042
ns4552375
ns1.02
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4929041.5
ns4981312.5
ns0.99
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5549750.5
ns5767584
ns0.96
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1070564
nsdense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns541
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
542
ns541
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23780
nsdense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2209
ns2125
ns1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2209
ns2208
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2250
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2084
ns2125
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
170642
nslayernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
3667
ns4167
ns0.88
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4750
ns4375
ns1.09
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5208
ns4875
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4041
ns4500
ns0.90
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
65525
nslayernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11084
ns11291
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12083
ns11292
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12208
ns12000
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10834
ns11375
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
445478.5
nsgroupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5917
ns6458
ns0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6666
ns6833
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8167
ns8000
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6166
ns6875
ns0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
52877
nsgroupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
18250
ns17083
ns1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
18458
ns19250
ns0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18542
ns17791.5
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17520.5
ns17875
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
296963
nsbatchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
583
ns625
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
667
ns625
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns542
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
32928.5
nsbatchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9271
ns8792
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9208
ns8875
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9354.5
ns8916.5
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8375
ns8209
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
157633
nsdense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64458
ns64500
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64917
ns64583
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64583
ns64250
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64375
ns64750
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111288
nsdense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
278375
ns285625
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
292291
ns283375
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
278833
ns276208.5
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
279500
ns297500
ns0.94
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
186917
nsmlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3287958
ns3402333
ns0.97
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
2909792
ns3060583
ns0.95
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3017771
ns3019687.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
3935292
ns4056229
ns0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
579655
nsmlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7602875
ns7721750
ns0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7372333
ns7459709
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7461313
ns7439375.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8220167
ns8277625
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1357048
nsmlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17533125
ns17593999.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
17557125
ns17466354
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
17531667
ns17549604.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
9214250
ns9302166.5
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23446917
ns23554916.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43586125
ns33592458
ns1.30
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37247062.5
ns37227500
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35028291.5
ns35248104
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1855921.5
nsConv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
189114500
ns188482416
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
178190333
ns164033541
ns1.09
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
153393396
ns153090042
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
434855500
ns443063541
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13947546
nsConv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
290046875
ns290580729
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
271392771
ns257093729.5
ns1.06
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
284812041.5
ns296199833.5
ns0.96
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
473569708.5
ns482390645.5
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
23021
ns22750
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
22458
ns24645.5
ns0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23625
ns23792
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
22708
ns21958
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
96516
nslayernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
115458.5
ns103459
ns1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
103250
ns104709
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
104375
ns103916.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
105042
ns103729.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
508001.5
nslayernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5750
ns5834
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6500
ns6083
ns1.07
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6708
ns6625
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6125
ns6209
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
68991.5
nslayernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14042
ns14667
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15500
ns15020.5
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15687.5
ns16020.5
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14500
ns15250
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
478721
nsConv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
2979083.5
ns3027500
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2084000
ns2071021
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2281500
ns2285333.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4814250
ns4820958
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
585630.5
nsConv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23560375
ns23646313
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18266583.5
ns18048395.5
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
16959209
ns16906125
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
34863041.5
ns35430208
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2766675
nsConv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33305667
ns33437292
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27994104
ns27650521
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27448959
ns27492875
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
40756916
ns42564979.5
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74000
ns72854.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
73333
ns73458
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
74917
ns74021
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74500
ns75000
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
104050
nslayernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218083
ns303958
ns0.72
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
210625
ns219312.5
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
296708.5
ns219042
ns1.35
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
217792
ns319666.5
ns0.68
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
558286.5
nslayernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11750
ns11500
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12417
ns11959
ns1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
12458.5
ns12416
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
11834
ns12208
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
72847.5
nslayernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26125
ns26083.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
27167
ns26104.5
ns1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27375
ns27209
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26458
ns26750
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
484580
nsgroupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11583
ns12166.5
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12167
ns12645.5
ns0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14000
ns13500
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
11792
ns12875
ns0.92
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
55176
nsgroupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
25542
ns25750
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26417
ns26459
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
28709
ns26375
ns1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26042
ns25833
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
307604.5
nsgroupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
179208
ns182125
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
181042
ns180500
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
184333.5
ns183000
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
179416
ns180375
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
57654
nsgroupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
590646
ns581750
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
591479
ns590708.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
593500
ns609500
ns0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
582749.5
ns594250
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
291261
nslayernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6083.5
ns5625
ns1.08
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6375
ns5958
ns1.07
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6708
ns6500
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6292
ns6250
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
71643
nslayernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14250
ns13917
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15167
ns13916
ns1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15292
ns14583
ns1.05
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14042
ns14291
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
470922.5
nsbatchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1203770.5
ns1196250
ns1.01
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1236645.5
ns1251708
ns0.99
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1343083
ns1274542
ns1.05
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1024395.5
ns1013000
ns1.01
batchedmm(512, Bsize=4)/forward/GPU/CUDA
300123
nsbatchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4091000
ns4142875
ns0.99
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4576917
ns4864958
ns0.94
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4574875.5
ns4545520.5
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
3718250
ns3911541.5
ns0.95
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1038641
nsdense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1875
ns1792
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1875
ns1834
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23874.5
nsdense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5083
ns4834
ns1.05
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5000
ns4917
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4959
ns5000
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4916
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
193867
nsgroupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5500
ns5250
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5709
ns5917
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6875
ns6333
ns1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5416
ns6042
ns0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
57200
nsgroupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11042
ns11000
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11584
ns11458
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11500
ns11292
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10625
ns11000
ns0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
332575
nsdense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
375
ns292
ns1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns333
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
334
ns334
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
334
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22978
nsdense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2834
ns2708
ns1.05
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2792
ns3041
ns0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3000
ns2792
ns1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2833
ns2709
ns1.05
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
163496
nsgroupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
11625
ns11167
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11292
ns11667
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
12875
ns12375
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11209
ns12083
ns0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
58225
nsgroupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24958
ns25083
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25208
ns25416
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25375
ns25167
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
25042
ns24583
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
299318
nsdense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4250
ns4208
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4250
ns4250
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4250
ns4250
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4250
ns4250
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
25190
nsdense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16209
ns16375
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16083
ns16417
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16625
ns16250
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16500
ns16042
ns1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
202972
nsbatchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5833
ns5750
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5792
ns5791
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5959
ns5875
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5792
ns5833
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
34611
nsbatchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20625
ns20375
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21042
ns20479.5
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21083
ns21208
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20125
ns20854.5
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
178483.5
nsbatchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
414125
ns427021
ns0.97
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
367771
ns388041
ns0.95
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
480813
ns475333
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
104146
ns107750
ns0.97
batchedmm(16, Bsize=512)/forward/GPU/CUDA
67750.5
nsbatchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
927125
ns885834
ns1.05
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
964354
ns960667
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1186833
ns1182208
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
376584
ns375875
ns1.00
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
192974.5
nsgroupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
77583
ns80125
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
79125
ns80750
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
83542
ns82167
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
79958
ns80791
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193934
nsgroupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1917959
ns1942937
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1933541
ns1918166.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1931521.5
ns1916333
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1860375
ns1923604
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
392771
nsdense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
333
ns333
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
22416
nsdense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1792
ns1833
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1875
ns1833
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
174762
nsgroupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6562.5
ns6167
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6417
ns6792
ns0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
8166
ns7333
ns1.11
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6208
ns6667
ns0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
59227
nsgroupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9292
ns8791.5
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9250
ns9416
ns0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9375
ns9292
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9083
ns9167
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
304901.5
nsConv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120543687.5
ns119015458
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
181954416.5
ns173560375
ns1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148126750
ns148104416
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
106134709
ns104510604
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5492614.5
nsConv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
609833750
ns611899646
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
578593208
ns555362500
ns1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
451045708.5
ns453017291
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
627478333.5
ns632276917
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
35107131
nsConv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
652518625
ns666765667
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
683671437.5
ns666371104
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
587115583.5
ns582119812.5
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
852245209
ns866159459
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58000
ns57541
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
39209
ns47708
ns0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
48208
ns46875
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
85167
ns84375
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38635
nsbatchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1920104
ns1944250
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1988000
ns1980416
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1980667
ns1976042
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1907896
ns1906083
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
176329
nslayernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
267041
ns267917
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
270500
ns268292
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
268750
ns267937.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
265291
ns267625
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
123893.5
nslayernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
596166
ns703792
ns0.85
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
698625
ns681124.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
702916.5
ns595667
ns1.18
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
589292
ns697208
ns0.85
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
677537.5
nslayernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2180187.5
ns2209437.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2215229
ns2173708
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2212000
ns2200062
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2207792
ns2113875
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
133207
nslayernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5497667
ns5503083
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5581500
ns5488667
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5516125
ns5509792
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5545124.5
ns5568042
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
717120
nsdense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
656041
ns638000
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
642917
ns645667
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
637375
ns647187.5
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
644167
ns644709
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46463
nsdense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1822875
ns1827583
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1668958.5
ns1720833
ns0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1723334
ns1720291
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2101084
ns2097125
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
222123
nsbatchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57667
ns59166
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38708
ns47625
ns0.81
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46916
ns45833
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
85084
ns84209
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
28664
nsbatchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2028604.5
ns2051584
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2097916.5
ns2075395.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2087625
ns2040667
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2005812
ns2021583
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
188609
nsConv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13343604
ns13373292
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12536250
ns12436750
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12547834
ns12559270.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
15250271
ns14986208.5
ns1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
510611.5
nsConv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47204500
ns47390625
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
41927292
ns41705020.5
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
40799666
ns40992438
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58864104
ns58725208
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2889030
nsConv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
73523334
ns73938270.5
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
91557750
ns90830563
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90571250.5
ns90514083
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
75976041
ns76122334
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58083
ns59916
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
38875
ns47541
ns0.82
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47709
ns47458
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82042
ns83500
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
48950
nsbatchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1916542
ns1948584
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1982083
ns1954250
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1947333
ns1965437.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1876854
ns1888625
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
195268
nsbatchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
333
ns333
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
32997
nsbatchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
5834
ns5979.5
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6500
ns6584
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6458.5
ns6500
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
5958
ns6187.5
ns0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
171034
nsdense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32918
nsdense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2750
ns2666
ns1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2750
ns2875
ns0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2917
ns2792
ns1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2625
ns2666
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
161268
nsConv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
286917729.5
ns286733687.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
347948583.5
ns339568833
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
314136145.5
ns314522187.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
267700542
ns270045166
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7080984
nsConv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1009676125
ns1015582292
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
974877416
ns953582875
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
854637270.5
ns840575375
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1260982959
ns1282644084
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34048271
nsConv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1387098104
ns1419694479.5
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1694333625
ns1672572375
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1631003167
ns1620047667
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1358038896
ns1358918958.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1411604.5
ns1454458
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1409250
ns1408583
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1407354.5
ns1410041.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1405916
ns1442292
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
128067
nsgroupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5023999.5
ns5055625
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5051396
ns5019625
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5029104.5
ns5009458
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5040479
ns5053667
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
514176
nsvgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
170919250
ns171675979
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
183735542
ns126429812.5
ns1.45
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
115460229.5
ns106760875
ns1.08
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
168486416
ns165741833.5
ns1.02
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4853309
nsvgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
627387000
ns622640208
ns1.01
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
561666625
ns492172500
ns1.14
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
453969542
ns462809167
ns0.98
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
654142166
ns660164833
ns0.99
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
17017885
nsbatchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
8912729
ns8982250
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
9063708
ns8969792
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7941979
ns7891125
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
9820979.5
ns9977959
ns0.98
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1590505
nsbatchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
36015084
ns36106959
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
38799959
ns37109917
ns1.05
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33679959
ns33736459
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
37936417
ns39159896
ns0.97
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6472671
nsbias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47459
ns47375
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47708
ns47500
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47625
ns47645.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47209
ns47500
ns0.99
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
17832
nsbias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50416
ns50417
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50292
ns50875
ns0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50458
ns51729
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50291
ns50333
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
162828
nsgroupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6208
ns6583
ns0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7083
ns7208
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7562.5
ns7646
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6292
ns7333
ns0.86
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
74130
nsgroupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9375
ns9292
ns1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns10209
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10375
ns10333
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9917
ns10167
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
422862.5
nsgroupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5666
ns5854.5
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6500
ns6292
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
6916
ns6834
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5375
ns6166
ns0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
78877.5
nsgroupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12875
ns12667
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13583
ns13208.5
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13583
ns13459
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13208
ns12958
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
370972.5
nsbatchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1083
ns1000
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1084
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1042
ns1042
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
33127
nsbatchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7792
ns7770.5
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8167
ns8125
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8083
ns7834
ns1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7792
ns8250
ns0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
187081.5
nsbias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23333
ns23417
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23417
ns23375
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23583
ns23500
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23084
ns23458
ns0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18527
nsbias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52042
ns52292
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52750
ns52667
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
52875
ns52667
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52542
ns52417
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
204233
nsgroupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1398875
ns1448145.5
ns0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1455625
ns1457021
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1404042
ns1402542
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1406584
ns1403042
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
196492.5
nsgroupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4999875
ns5036750
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5037708
ns5020979
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5003083
ns5021708
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5024916
ns5042708.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
495167
nsConv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3047396
ns3054459
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2106521
ns2092750
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2296895.5
ns2302708.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4962229.5
ns4935833
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
583841
nsConv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24384458
ns24359708.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
19075709
ns18879875
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17765562.5
ns17805083
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35955916.5
ns36477083
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2836787
nsConv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33991937.5
ns34112104.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28748917
ns28352833
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28081042
ns27995625
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41668854.5
ns42341709
ns0.98
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
142678458
ns143179166
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
147270333
ns147785458
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
126985770.5
ns126873458.5
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
174826021
ns172641167
ns1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22556485
nsbatchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
1026522125
ns1416291312.5
ns0.72
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
866022875.5
ns1304509479
ns0.66
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
743843334
ns1238526750
ns0.60
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
682878792
ns685736000
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
116543149
nslayernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76083
ns76042
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
76250
ns79459
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
77625
ns76687
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
75833.5
ns75124.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
163749.5
nslayernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
275437.5
ns189375
ns1.45
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
283542
ns278000
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
275959
ns289166.5
ns0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
282375
ns193709
ns1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
882740
nsbatchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35483000
ns35548875.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
36565000
ns36247291.5
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32543896
ns32430687.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40679500
ns40776042
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5828412
nsbatchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
147536708
ns148827666
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
157209875
ns152471625
ns1.03
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
136063312.5
ns135828541
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
286255000
ns224259958
ns1.28
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34875549.5
nsConv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
122158104.5
ns120283062
ns1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
181447688
ns173757375
ns1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147872917
ns148381833
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
104774833.5
ns100995854
ns1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5433572
nsConv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
468969166
ns468476625
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
487732687.5
ns466581667
ns1.05
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
437061208
ns438033125
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
745602708
ns758068771
ns0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
31632434
nsConv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
708533125.5
ns656498666
ns1.08
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
662068729.5
ns639464917
ns1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
625681375
ns572772729.5
ns1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
856533500
ns867522166
ns0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1243917
ns1241166.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
778625
ns960584
ns0.81
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
961709
ns985604
ns0.98
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2098041.5
ns2040750
ns1.03
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
581626.5
nsmlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2966062.5
ns3033584
ns0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2513979
ns2618542
ns0.96
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2620167
ns2633875
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3551916
ns3767750
ns0.94
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1532656
nsmlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
5803146
ns5830292
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
5896375
ns5796375
ns1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
5798708
ns5804458
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
2924083
ns2978917
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7083
ns7500
ns0.94
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5291
ns6042
ns0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6208
ns6209
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10166
ns10333
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25159
nsbatchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212500
ns212708
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220625
ns220542
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220709
ns223542
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213625
ns208708
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
199491.5
nsvgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
297113041
ns297468334
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
291058458
ns215016959
ns1.35
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
193310291.5
ns193569000
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
304396812.5
ns311798792
ns0.98
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7678125.5
nsvgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1231332166.5
ns1238998917
ns0.99
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
973933875
ns901957166.5
ns1.08
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
836913500
ns825878542
ns1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1148765416.5
ns1319998292
ns0.87
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26856489.5
nsgroupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4792
ns5542
ns0.86
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5875
ns5834
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6354
ns6708
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4667
ns5375
ns0.87
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
93183
nsgroupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7000
ns7083
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7625
ns7333
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7458
ns7875
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7395.5
ns7042
ns1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
440751
nsbatchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns583
ns0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
667
ns625
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
584
ns625
ns0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
500
ns541
ns0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24653
nsbatchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
8625
ns9083
ns0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9500
ns8666
ns1.10
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9917
ns9292
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
8792
ns8583
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
176547.5
nsbias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
353584
ns351792
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
353833
ns351708
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
352208
ns352375
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
351500
ns354000
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21275
nsbias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
807916.5
ns827667
ns0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
789854
ns779562.5
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
776042
ns778208
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
778833
ns824354.5
ns0.94
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
215262.5
nsbatchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
339229
ns337833
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
321000
ns342521
ns0.94
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
454187
ns452875
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
10916
ns11687.5
ns0.93
batchedmm(16, Bsize=32)/forward/GPU/CUDA
18631
nsbatchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
714125
ns713208.5
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
731625
ns736500
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1006333
ns1010250
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
26667
ns27208.5
ns0.98
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
196596.5
nsbatchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
381833.5
ns381792
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
330959
ns354187
ns0.93
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
444916.5
ns441708
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
31417
ns31083
ns1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA
23162
nsbatchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
727875
ns731646
ns0.99
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
783542
ns785667
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1030146
ns1027917
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
90750
ns91083
ns1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
193002.5
nsbias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3583
ns3542
ns1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3709
ns3458
ns1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3625
ns3583
ns1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3375
ns3542
ns0.95
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
17634
nsbias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4291
ns4167
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4208
ns4250
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4333
ns4500
ns0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4125
ns4208
ns0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
200435.5
nslayernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3500
ns3375
ns1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4167
ns3917
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4375
ns4084
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3583
ns3917
ns0.91
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
151437.5
nslayernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8458
ns8375
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8583
ns8167
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8333
ns8584
ns0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8458
ns8500
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
927946.5
nsbatchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204583
ns204791
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209000
ns210875
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
210500
ns211541
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199084
ns202083
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35183
nsbatchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
602833.5
ns600417
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
629209
ns627875
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
625584
ns630312
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
582250
ns583542
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
266930.5
nsbatchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
990542
ns1010270.5
ns0.98
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
1053625
ns1015521
ns1.04
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
954292
ns949979.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
901104
ns909416
ns0.99
batchedmm(128, Bsize=128)/forward/GPU/CUDA
206789.5
nsbatchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4511208
ns4557687.5
ns0.99
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4854542
ns4722959
ns1.03
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4490209
ns4470333.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
4299083.5
ns4443646.5
ns0.97
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
930739
nslayernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3084
ns3334
ns0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3500
ns3500
ns1
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4083.5
ns4125
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3000
ns3625
ns0.83
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
144120
nslayernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7250
ns7292
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7333
ns7167
ns1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7500
ns7167
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7041
ns7458.5
ns0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
806482
nsConv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1636250
ns1562000
ns1.05
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1158208.5
ns1179000
ns0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1368083
ns1346417
ns1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2308063
ns2481104
ns0.93
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214505
nsConv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12270583
ns12361833
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9567750
ns9575979
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9243645.5
ns9245041
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18134146
ns18149645.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1954133
nsConv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17281250
ns17389625
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14453375
ns14446583
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14325333
ns14298208.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21045500
ns21068500
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
85708
ns88500
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
91520.5
ns99167
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
93250
ns91917
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
87833.5
ns90708.5
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
126207
nsgroupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2017958
ns2074916
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2050542
ns2029541
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2029834
ns1761250
ns1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2026959
ns2035041.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
841405
nsbatchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
1375
ns2084
ns0.66
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
1917
ns2666
ns0.72
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
3583.5
ns3583.5
ns1
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
2375
ns1916
ns1.24
batchedmm(2, Bsize=4)/forward/GPU/CUDA
16017
nsbatchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2875
ns2625
ns1.10
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2833
ns2875
ns0.99
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
2750
ns2917
ns0.94
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2792
ns2834
ns0.99
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
165765.5
nsbatchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7208
ns7375
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5333
ns6042
ns0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5958
ns6083
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10084
ns10083
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34231
nsbatchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214458
ns212333.5
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220042
ns220563
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221416
ns223084
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
235834
ns208417
ns1.13
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
263066.5
nsdense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3708
ns3750
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3750
ns3750
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3750
ns3709
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3708
ns3750
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22879.5
nsdense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14459
ns14709
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14375
ns14625
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14541
ns14541
ns1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14500
ns14292
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
399546.5
nsgroupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
94312.5
ns94500
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
95875
ns93916.5
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
97583
ns96125
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
94354.5
ns95625
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125486.5
nsgroupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1919437.5
ns1950959
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1938250
ns1918895.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1927084
ns1651334
ns1.17
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1803750
ns1942375
ns0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
794850
nslenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
875354.5
ns881833
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
802104.5
ns830792
ns0.97
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1225042
ns1225417
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
970374.5
ns944312.5
ns1.03
lenet(28, 28, 1, 32)/forward/GPU/CUDA
273954
nslenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2714354
ns2742708
ns0.99
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2504167
ns2522750
ns0.99
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3360375
ns3329959
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3360334
ns3361458
ns1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1467965
nsgroupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17542
ns15166.5
ns1.16
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
16937.5
ns17000
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18708
ns16583
ns1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
14584
ns15667
ns0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
129735
nsgroupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214709
ns214666
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215958.5
ns224541.5
ns0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
215562.5
ns216208
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
217958
ns217645.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
539139.5
nslayernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
223375
ns219500
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
220958
ns220000
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
222645.5
ns221167
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
219625
ns220834
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
217203.5
nslayernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
495895.5
ns495958
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
506625
ns507958
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
510958
ns498625
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
561375
ns506541
ns1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1153506.5
nsbatchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
3917
ns4166.5
ns0.94
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
4667
ns4312.5
ns1.08
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
4834
ns4583
ns1.05
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
4833
ns4625
ns1.04
batchedmm(16, Bsize=4)/forward/GPU/CUDA
17326
nsbatchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
7520.5
ns7187.5
ns1.05
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
7625
ns7292
ns1.05
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
7458
ns7229.5
ns1.03
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
7417
ns7625
ns0.97
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
176736
nsgroupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
16646
ns17417
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18500
ns19292
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19625
ns18625
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18042
ns18500
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
133143.5
nsgroupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213000
ns219083.5
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212916
ns211959
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213667
ns213521
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
224895.5
ns213208
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
820129
nslayernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4354.5
ns4250
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4625
ns4334
ns1.07
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4917
ns4750
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3875
ns4375
ns0.89
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
175343
nslayernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10208
ns10417
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10333
ns10750
ns0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10834
ns10500
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10208
ns10500
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
980341
nslayernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3250
ns2958
ns1.10
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3687.5
ns3417
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4292
ns3959
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
2917
ns3542
ns0.82
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
215866
nslayernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7166
ns7291
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7625
ns7458
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7792
ns7583
ns1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7375
ns7625
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1015020
nsConv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23687417
ns23616833
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
42666354
ns34076542
ns1.25
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37344478.5
ns37648750
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34948333.5
ns35355896
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1824017
nsConv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
183871416
ns185118750
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
182812313
ns161569416
ns1.13
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
145975437.5
ns146021041.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
274277542
ns274915208
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16507012
nsConv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
273782791
ns273527291
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
257949042
ns244066854
ns1.06
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
231995083.5
ns231262500
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
323882958.5
ns325681645.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
183541
ns183916.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
184000
ns184479.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
185292
ns183709
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
182542
ns185125
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
191911.5
nsgroupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
629458.5
ns635250
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
587334
ns590375
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
587125.5
ns586375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
649291
ns586875.5
ns1.11
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
963628
nsbatchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3851750
ns3912854
ns0.98
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3983792
ns3922688
ns1.02
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3579833
ns3534875
ns1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
4612292
ns4683208
ns0.98
batchedmm(128, Bsize=512)/forward/GPU/CUDA
531156
nsbatchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17385812.5
ns17461333
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
18439958.5
ns17877604
ns1.03
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16577084
ns16535333
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
20232667
ns20876542
ns0.97
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2638769
nsbatchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
625
ns500
ns1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns583
ns1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns541
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32361
nsbatchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9312.5
ns8875
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9604.5
ns9458
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9541
ns9167
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8750
ns9084
ns0.96
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
248738
nsvgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
650277229.5
ns653952292
ns0.99
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
513797917
ns393857103.5
ns1.30
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
364513416
ns328714250
ns1.11
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
753229708
ns759532875
ns0.99
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
11759811
nsvgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1878034500
ns1886540417
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1671899375
ns1638767625
ns1.02
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1507608416.5
ns1505416479
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2202946667
ns2232982666.5
ns0.99
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49516620
nsConv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1535958.5
ns1645500
ns0.93
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1179292
ns1196083
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1380729.5
ns1372166
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2368083
ns2490500
ns0.95
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
215337
nsConv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12730083
ns12742021.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9937625
ns9937333.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9659583.5
ns9670291
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18459917
ns18551458
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2010689
nsConv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17677292
ns17729729
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14810083
ns14747250
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14573229.5
ns14539958
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21483000
ns21491875
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26292
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26250
ns26292
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26208
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23665
nsdense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67166
ns67416
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66875
ns67167
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
67250
ns68042
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66958
ns66917
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
367986.5
nsbatchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204583
ns203916
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209292
ns209500
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
210500
ns208375
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199625
ns199583
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26073
nsbatchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
613125
ns615979
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
625459
ns622458.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
633583
ns625042
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
632083
ns628771
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
320857.5
nslayernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
592750
ns654750
ns0.91
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
647000
ns648792
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
648834
ns639250
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
671792
ns553000
ns1.21
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
131354
nslayernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2247291
ns2255292
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2303208
ns2216833.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2243604
ns2230625
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2314875.5
ns2261625
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1083962
nsgroupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
16687.5
ns17479.5
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18458
ns18166
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19770.5
ns18334
ns1.08
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18146
ns18542
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
132087.5
nsgroupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
229375
ns230250
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
262896
ns218666.5
ns1.20
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
231208
ns220145.5
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
258624.5
ns225083.5
ns1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
885149.5
nsbatchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
667
ns542
ns1.23
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
542
ns541
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23686
nsbatchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
8708
ns9625
ns0.90
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
10000
ns9500
ns1.05
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10000
ns9583
ns1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9250
ns9583
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
241904
nsgroupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5417
ns5166
ns1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5583
ns5667
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6417
ns6291.5
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4770.5
ns5625
ns0.85
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
194851.5
nsgroupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7667
ns6959
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7417
ns7709
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7792
ns7125
ns1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7250
ns7250
ns1
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
705733
nsbias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2167
ns2292
ns0.95
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2208
ns2125
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2542
ns2333
ns1.09
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2208
ns2167
ns1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17804
nsbias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6541
ns6354.5
ns1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6500
ns6500
ns1
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6875
ns6583.5
ns1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6417
ns6459
ns0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
294742
nsbias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
746916
ns748750
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
761333
ns746708
ns1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
750541
ns749375
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
749459
ns749125
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
20924
nsbias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
790875
ns794125
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
777375
ns775500
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
792500
ns775812.5
ns1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
778250
ns794500.5
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
268681.5
nsbatchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7375
ns7458
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5250
ns6084
ns0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5875
ns5583
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10292
ns10541
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32725
nsbatchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
219208
ns231542
ns0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
230937.5
ns231875
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
236625
ns229604
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214312.5
ns215187.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
332717.5
nslayernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10291
ns10166.5
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10937.5
ns10416
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10625
ns10479
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9916
ns10417
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
219475.5
nslayernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24416
ns25083.5
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25417
ns23916
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24875
ns24625
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24354.5
ns25000
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1060762
nsConv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106190416
ns106424375
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
126215417
ns117279208.5
ns1.08
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120200125
ns120424354
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117655917
ns117916208
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2587994
nsConv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
395454916.5
ns397131541.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
372350083.5
ns366183958
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
355285895.5
ns355277020.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
542892500
ns545563875.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15209611
nsConv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
607219000
ns609770291
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
775694542
ns756955334
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
743546708
ns745569813
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
606917208
ns607706416.5
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6729.5
ns6875
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7458
ns9229
ns0.81
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8791
ns8833
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6084
ns7500
ns0.81
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
214170
nsgroupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14645.5
ns14375
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14167
ns13750
ns1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14334
ns14667
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13417
ns13542
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1010027
nsgroupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6042
ns5959
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6708.5
ns6354.5
ns1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
6958
ns7083
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5166.5
ns6042
ns0.86
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
211003
nsgroupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12916
ns12666
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12979.5
ns12917
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13041
ns12916
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12375
ns12292
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
725511
nsbatchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
5792
ns5875
ns0.99
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
6084
ns5937.5
ns1.02
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
7166
ns5812.5
ns1.23
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
5979.5
ns6000
ns1.00
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16985
nsbatchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
16375
ns15375
ns1.07
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
15917
ns18229.5
ns0.87
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
15750
ns15625
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
15750
ns15834
ns0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
184955.5
nsbatchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
333
ns334
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
417
ns417
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
417
ns416
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23469
nsbatchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6375
ns6291
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6292
ns6541
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6458
ns6375
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6020.5
ns6042
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
226513
nsbatchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5917
ns5958
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6000
ns5917
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6083
ns6083
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5833
ns5833
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
24637
nsbatchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21375
ns20895.5
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21083
ns21084
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21167
ns21334
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
20875
ns20875
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
248819
nslayernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
144938
ns145167
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
147666
ns145333
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
147500
ns147791
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
144208
ns146250.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166863.5
nslayernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1328917
ns1351583
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1366916.5
ns1324833.5
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1323667
ns1269708
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1330125
ns1342020.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1231201
nslayernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
21917
ns24854
ns0.88
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
23250
ns24750
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
25417
ns24083.5
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24583
ns23041.5
ns1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
261684.5
nslayernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
126249.5
ns130333
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
132125
ns131875
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
180458
ns120583
ns1.50
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
182166
ns127250
ns1.43
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1329052
nsbatchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
334
ns333
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns333
ns1.13
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
23064
nsbatchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6417
ns6375
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6500
ns6750
ns0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6583
ns6167
ns1.07
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6083
ns6166
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
241726
nslayernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4583
ns4250
ns1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4875
ns4583
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5062.5
ns5000
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4375
ns4666
ns0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
230879.5
nslayernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9792
ns9917
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10375
ns10000
ns1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10333
ns10458
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10125
ns10250
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1281938
nsdense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1584
ns1584
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1584
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23016.5
nsdense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5709
ns5625
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5750
ns6000
ns0.96
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6042
ns5792
ns1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5625
ns5666
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
260870.5
nsConv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6736854
ns6809750
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6358292
ns6375834
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6526333
ns6505250
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7511917
ns7653125.5
ns0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214549
nsConv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24072542
ns24098271
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21309271.5
ns21313750
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21010584
ns21034292
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29840125
ns29936333.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2110310.5
nsConv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
37228250
ns37354916.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45827250
ns45524125
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45480416
ns45728625
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
38465479
ns38256604.5
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5708
ns5708
ns1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5708
ns5916
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6729.5
ns6542
ns1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5208.5
ns5958
ns0.87
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
215925.5
nsgroupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8833
ns8792
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8417
ns8375
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8625
ns8792
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8145.5
ns8042
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1004537.5
nslenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1503813
ns1544521
ns0.97
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1243541.5
ns1274291.5
ns0.98
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1631312.5
ns1619792
ns1.01
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2004542
ns2113874.5
ns0.95
lenet(28, 28, 1, 128)/forward/GPU/CUDA
280207
nslenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7912062.5
ns7917042
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6650042
ns6631541
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7185875
ns7090646
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10076645.5
ns10525708
ns0.96
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1812720
nsbatchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
371770.5
ns363667
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
359708
ns373917
ns0.96
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
457000
ns456000
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
27125
ns24312
ns1.12
batchedmm(128, Bsize=4)/forward/GPU/CUDA
47414
nsbatchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
728042
ns737791.5
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
792916
ns796895.5
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1060625
ns1063396
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
122625
ns91145.5
ns1.35
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
280856
nsdense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397666
ns397459
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
213417
ns287666
ns0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288291
ns287958
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
754041
ns751208
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
44363
nsdense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
669875
ns667375
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
474875
ns532500
ns0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
529792
ns533459
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
975625
ns974250
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
194646.5
nslayernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
678312.5
ns677250
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
642583
ns646333
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
646625
ns555812.5
ns1.16
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
638374.5
ns589334
ns1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132515
nslayernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2433792
ns2506042
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2525125
ns2452187.5
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2458416
ns2421083
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2464167
ns2509083.5
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1286025
nsbatchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
4270.5
ns3042
ns1.40
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
2791
ns3500
ns0.80
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
4334
ns3709
ns1.17
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
3021
ns2834
ns1.07
batchedmm(2, Bsize=32)/forward/GPU/CUDA
17018
nsbatchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
5583
ns5458
ns1.02
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
5542
ns5625
ns0.99
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5500
ns5625
ns0.98
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
5584
ns5583
ns1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
187936.5
nsbatchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1463042
ns1459917
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1495875
ns1499291
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1503458
ns1501417
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1446334
ns1439583
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
41308.5
nsbatchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5127000
ns5106812.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5300416.5
ns5286437.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5293458
ns5284041.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4725667
ns4996333.5
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
195229
nsdense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3709
ns3709
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3709
ns3708
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3709
ns3708
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3708
ns3750
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
33264.5
nsdense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15250
ns15250
ns1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15083
ns15417
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15417
ns15416
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15125
ns15000
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
350238
nsdense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71333
ns71500
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71417
ns71333
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71208
ns70542
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
71500
ns71250
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
112408
nsdense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
318125
ns319958
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
327584
ns318333
ns1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
319500
ns318208
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
320333
ns321834
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
194166
nsbatchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1000
ns1000
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1084
ns1083
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1125
ns1084
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
23803
nsbatchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8000
ns7916
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8417
ns8208
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8417
ns8125
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7708
ns7667
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
246141
nsbatchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
501979.5
ns514834
ns0.98
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
480104
ns490208
ns0.98
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
566979
ns567542
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
220416
ns218520.5
ns1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA
128980
nsbatchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1391667
ns1371833
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1479770.5
ns1457062.5
ns1.02
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1756604
ns1755667
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
864792
ns909250
ns0.95
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
275170
nsbatchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
417
ns416
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns333
ns1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31717
nsbatchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6625
ns6166
ns1.07
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6542
ns6708
ns0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6500
ns6125
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
5958
ns6083
ns0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
248251
nslayernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1776021
ns1721334
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1733687.5
ns1725146
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1727458
ns1724500
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1726125
ns1728229.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
167904
nslayernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4363208
ns4358375
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4382750
ns4376792
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4374000
ns4335333
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4367334
ns4390375
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1079923
nsbias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6875
ns6750
ns1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6708
ns6625
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
6792
ns6875
ns0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6666
ns6542
ns1.02
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
19517
nsbias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
59895.5
ns32500
ns1.84
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
49208
ns50895.5
ns0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
52583
ns32875
ns1.60
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
32417
ns49729
ns0.65
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
267079.5
nsbatchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
18084
ns17937.5
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
18292
ns18042
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
19709
ns18125
ns1.09
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
18292
ns18458
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18390
nsbatchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
53833
ns53208
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
53375
ns53250
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
53375
ns53250
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
53625
ns53562.5
ns1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
319120
nsdense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75333
ns75709
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75583
ns75291
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75250
ns75208
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75500
ns75250
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46304
nsdense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
324291
ns330270.5
ns0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
336479.5
ns328625
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
324708
ns325083
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
327458
ns329042
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
209708.5
nsbatchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1487583
ns1486375
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1522083
ns1526375
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1529334
ns1527375
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1471333
ns1464666
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
52335
nsbatchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5126125
ns5175375
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5305125
ns5310021
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5295000
ns4950479
ns1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4684000
ns5010146
ns0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
202194.5
nsdense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28333
ns28208
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28333
ns28375
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28292
ns28292
ns1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28209
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24238
nsdense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66500
ns66292
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66250
ns66375
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66416
ns66459
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66625
ns66459
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
495044
nsmlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1478812
ns1396208.5
ns1.06
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
933416.5
ns1137042
ns0.82
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1129625
ns1061959
ns1.06
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2267917
ns2245417
ns1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
577563.5
nsmlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3095187.5
ns2966209
ns1.04
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2641125
ns2741250
ns0.96
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2747417
ns2597667
ns1.06
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3815833.5
ns3844125
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
1965829
nsmlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
7798041
ns7918709
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
8017625
ns7905417
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
7904083.5
ns7547354
ns1.05
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
4861812
ns4916042
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
119833.5
ns80583
ns1.49
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81604
ns81458
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
82000
ns81541
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80604
ns80709
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193857.5
nsgroupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2020000
ns2026042
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2021083
ns2026125.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2024292
ns1719750
ns1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1749917
ns2018208
ns0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
744082.5
nsThis comment was automatically generated by workflow using github-action-benchmark.