Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: bump compat for GPUArraysCore to 0.2 for package docs, (keep e…
…xisting compat) (#985) Co-authored-by: CompatHelper Julia <[email protected]>
- Loading branch information
3d1ff6c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s)
412333
ns411833
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s)
322708
ns322270.5
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s)
322354.5
ns322687.5
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s)
739667
ns739792
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA
43934
ns43717
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s)
605084
ns592458
ns1.02
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s)
511813
ns485750
ns1.05
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s)
476187.5
ns472146
ns1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s)
2280042
ns916416
ns2.49
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA
191965
ns193389
ns0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s)
720583.5
ns732083
ns0.98
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s)
629375
ns630020.5
ns1.00
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s)
593479
ns590250
ns1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s)
2247208
ns1008000
ns2.23
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1518562
ns1531625.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1187166.5
ns1199500
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1387229
ns1370166
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2947959
ns2432729.5
ns1.21
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
211504
ns211497
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12301292
ns12247917
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9560875.5
ns9551854.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9311271.5
ns9290625
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18616125
ns17955583
ns1.04
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1926828
ns1916393.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17354500
ns17351270.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14318812.5
ns14353042
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14334708
ns14309667
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21859292
ns21080250
ns1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
121057729
ns121821646
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174314729
ns174069521
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147379166
ns148056167
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
447559833
ns106139667
ns4.22
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5496733
ns5478633
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
595612958.5
ns596837750
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
542499292
ns543667792
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
446168125
ns445085375
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1630779417
ns626736625
ns2.60
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
35003993.5
ns38176542
ns0.92
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
655003333.5
ns652965479.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
677053333
ns674093584
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
584185208
ns632863021
ns0.92
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1732551062.5
ns743445292
ns2.33
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
880584
ns849625
ns1.04
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
822625
ns832854.5
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1226625
ns1217000
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
782750
ns966042
ns0.81
lenet(28, 28, 1, 32)/forward/GPU/CUDA
270475
ns266296.5
ns1.02
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2740959
ns2721500
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2494167
ns2466917
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3327979
ns3314395.5
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3134292
ns3364958.5
ns0.93
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1067170
ns1061958
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
2264271
ns2259875
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1552417
ns1580250
ns0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1753479
ns1752416.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
4348083
ns3779541
ns1.15
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214769
ns212874
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
20483167
ns20464770.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
17691916
ns17681833
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
17963833
ns17968916
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
26775375
ns26220958.5
ns1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1991226
ns1983562
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
45016687.5
ns44361875
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
42002229.5
ns42037625
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
41336854.5
ns41240937.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
47744959
ns47003375
ns1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
4319020.5
ns4301083.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2876959
ns2876167
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
3010167
ns2986437.5
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
8658375
ns7412625
ns1.17
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
514332
ns515223
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
40234750
ns40138542
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
34767583
ns34883937.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
33924250
ns33862542
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
53719958
ns51421084
ns1.04
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2979961.5
ns2979770
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
89992458
ns88409354.5
ns1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
84426916.5
ns84462416
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
82809646
ns83166916.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
96502584
ns93812228.5
ns1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
142457125
ns143119041
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
186377999.5
ns186909958.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
160522958
ns160607000
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
489638250
ns149056313
ns3.28
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7101620
ns7091795
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
877579500
ns876576041.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
810323667
ns819011417
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
714880166.5
ns713621416.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
2042862020.5
ns1026954750.5
ns1.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34011046
ns33962668
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1671563875
ns1654338292
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1561654708
ns1556399750
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1478668938
ns1456365229
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
2558813125
ns1581565875
ns1.62
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1545417
ns1500042
ns1.03
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1269125
ns1281708
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1641375
ns1629875
ns1.01
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2465854.5
ns2163395.5
ns1.14
lenet(28, 28, 1, 128)/forward/GPU/CUDA
268247
ns262650.5
ns1.02
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7879417
ns7601959
ns1.04
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6568854.5
ns6596916
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7162916
ns7128375
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
11708979
ns10476396
ns1.12
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1072114.5
ns1087771
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
186008021
ns185964437.5
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
145478792
ns146352312.5
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
128424688
ns130050146
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
452715625
ns179543416.5
ns2.52
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4848333.5
ns4845696
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
641184291
ns643688917
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
524088958
ns604191917
ns0.87
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
537727000
ns537019041
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
1403114875
ns663244750
ns2.12
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
18681328
ns16664478
ns1.12
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1095875
ns1073937.5
ns1.02
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
967604
ns979688
ns0.99
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1353791
ns1338583
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1322541
ns1380812
ns0.96
lenet(28, 28, 1, 64)/forward/GPU/CUDA
270349
ns265966
ns1.02
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6033416.5
ns6009021
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4668271
ns4658625
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4931041
ns4922187.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
6027000
ns5723978.5
ns1.05
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1111457
ns1137942.5
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23769000
ns23733624.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34212937.5
ns35284771.5
ns0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37101833
ns37100750.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132610375
ns35260167
ns3.76
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1831215
ns1834016
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
184677875
ns184898625
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
159062583
ns160642834
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
144384604
ns144248000
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
534807250
ns271530583
ns1.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16477829
ns16393096
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
297162354.5
ns296257000
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
243897583
ns245304833
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
298830750.5
ns301408687
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
713110625
ns446273791
ns1.60
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
658030208
ns656873875
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
432501687.5
ns433591937.5
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
400225625
ns402349417
ns0.99
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
1771604646
ns677798728.5
ns2.61
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12483387
ns12482697
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1887633520.5
ns1891955437.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1637268417
ns1637549708
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1504383479
ns1514000729
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
5051328625
ns2113439354.5
ns2.39
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49779380
ns49760182
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3063562.5
ns3046500
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2098125.5
ns2098166
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2292583
ns2287292
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6036250
ns4866125
ns1.24
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
581533.5
ns582507.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
25421500
ns25579833
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
20387000
ns20277104
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19188625
ns19545458
ns0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
39410312.5
ns36687292
ns1.07
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2998929
ns2979368
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
35068167
ns35578625
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28412292
ns28390167
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
30184750
ns30144895.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
45702312
ns42776229
ns1.07
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1657041.5
ns1650667
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1194958
ns1204458
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1386854
ns1396750
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
3047792
ns2509645.5
ns1.21
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
216764
ns218107
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12728917
ns12697333
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9975625
ns9973959
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9685688
ns9758687
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
19011166.5
ns18284458
ns1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1948689
ns1944527.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17689854
ns17688854
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14742625
ns14754291
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14638458.5
ns14674374.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
22191709
ns21468083.5
ns1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23557167
ns23681167
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34461708
ns34404604
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37530375
ns37545958
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132666521
ns35268000
ns3.76
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1832030
ns1848561
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
189557666.5
ns190505958.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
237037020.5
ns237366917
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
197049458.5
ns194090667
ns1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
727023166
ns460122917
ns1.58
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13923369
ns13928578
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
302073604
ns301146020.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
250383541.5
ns250240417
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
306286250
ns308748000
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
717294500
ns395462625
ns1.81
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
1917792
ns1916083.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
1580771
ns1556917
ns1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
1574458
ns1579625
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
2652249.5
ns2659291.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
575627
ns570148
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
6156375
ns6146812.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
5936542
ns5943834
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
5926084
ns5926041
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
9429521
ns6788041.5
ns1.39
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1376631
ns1353691.5
ns1.02
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
18780958
ns18785021
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
19122375
ns19131625
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
19117250
ns19125833
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
18883917
ns15678041
ns1.20
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s)
70021
ns68937
ns1.02
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s)
68417
ns68625
ns1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s)
70500
ns70792
ns1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s)
727312.5
ns69854
ns10.41
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA
47914
ns47405.5
ns1.01
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s)
355562.5
ns287792
ns1.24
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s)
325250
ns312812.5
ns1.04
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s)
326687.5
ns280416
ns1.17
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s)
2205479
ns281521
ns7.83
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA
213681
ns211915
ns1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s)
393084
ns444500
ns0.88
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s)
450084
ns448250
ns1.00
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s)
444728.5
ns391667
ns1.14
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s)
2229333
ns357041.5
ns6.24
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3034917
ns3044791
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2091583.5
ns2094645.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2284000
ns2278916.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6013125
ns4567208
ns1.32
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
576987
ns585440
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23582313
ns23578062.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18082917
ns18085666
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
16983417
ns16978625
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
37559208
ns34976833
ns1.07
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3055304
ns2912837
ns1.05
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33242479.5
ns33419374.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27632833.5
ns27788708
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27457834
ns27373667
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
44681646
ns42059688
ns1.06
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120219875
ns118607334
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174640416.5
ns173693458.5
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147464708
ns147902833
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
447824083.5
ns108303292
ns4.13
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5463169
ns5451158
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
471068209
ns470478958
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
466756437
ns467481645.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
436859104.5
ns434223083.5
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1751773916
ns737222479.5
ns2.38
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
32302579.5
ns35181339
ns0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
637667500
ns635200500
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
664219750.5
ns665043396
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
586420938
ns582947041.5
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1733809521
ns731724375
ns2.37
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1232125
ns1304833
ns0.94
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
975229
ns937167
ns1.04
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
903250
ns903709
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
1950333
ns2036958
ns0.96
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
568402.5
ns564089
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2957958
ns2960625
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2629625
ns2635667
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2593375
ns2619417
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
7086750
ns3698292
ns1.92
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1321605.5
ns1319613
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
6642209
ns6561416
ns1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
6552584
ns6499959
ns1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
6490000
ns6497875
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
7617500
ns4438375
ns1.72
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s)
39583
ns39271
ns1.01
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s)
31291
ns32458.5
ns0.96
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s)
35041
ns32062.5
ns1.09
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s)
91458
ns54437.5
ns1.68
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA
27908
ns27919
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s)
175479.5
ns179042
ns0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s)
175625
ns175541
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s)
175667
ns175167
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s)
273166
ns190708.5
ns1.43
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA
218444
ns219938
ns0.99
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s)
442021
ns442334
ns1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s)
442417
ns463458.5
ns0.95
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s)
442062.5
ns442417
ns1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s)
510625
ns429500
ns1.19
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s)
13375
ns13562.5
ns0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s)
12833
ns13437.5
ns0.96
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s)
14187.5
ns14416
ns0.98
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s)
54458
ns14375
ns3.79
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA
27839
ns28121
ns0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s)
25708
ns25917
ns0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s)
25708
ns25667
ns1.00
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s)
25792
ns25625
ns1.01
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s)
151770.5
ns26250
ns5.78
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA
208205
ns209865
ns0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s)
46083
ns45437.5
ns1.01
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s)
45458
ns46479.5
ns0.98
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s)
45875
ns46041
ns1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s)
151145.5
ns28209
ns5.36
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
318485791
ns318266167
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
236155250
ns238108104
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
204947249.5
ns203733333
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
870093062.5
ns322939875
ns2.69
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7672854
ns7668589
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1103355583.5
ns1098692854.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
951012041.5
ns952627249.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
915597916
ns856876291
ns1.07
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
2647669125
ns1173710250
ns2.26
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
27249547
ns27280510.5
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s)
193938
ns193124.5
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s)
167312.5
ns168542
ns0.99
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s)
167834
ns168187.5
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s)
873291.5
ns218458.5
ns4.00
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA
47232
ns47292
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s)
1215770.5
ns1214729
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s)
1097562.5
ns1095750
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s)
1097292
ns1014896
ns1.08
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s)
2767479.5
ns1504666
ns1.84
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA
222773
ns222578.5
ns1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s)
2290771
ns2298292
ns1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s)
2230896
ns2283250
ns0.98
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s)
2223666
ns2158334
ns1.03
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s)
3710292
ns2476833
ns1.50
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1586500
ns1582437.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1236041.5
ns1264833
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1234250
ns1174562.5
ns1.05
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2225875
ns2357375
ns0.94
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
574042.5
ns571094.5
ns1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3206334
ns3197541
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2859000
ns2843042
ns1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2838875
ns2853458
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
7347042
ns3931104
ns1.87
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
1353212
ns1330355
ns1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
8838958
ns8842250
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
8778959
ns8776708
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
8989687
ns8804292
ns1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
9543937
ns6342000
ns1.50
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s)
2500
ns4625
ns0.54
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s)
2229
ns2458
ns0.91
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s)
2500
ns2542
ns0.98
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s)
2709
ns2416
ns1.12
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA
25189
ns24562
ns1.03
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s)
7417
ns7125
ns1.04
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s)
7083
ns7125
ns0.99
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s)
7209
ns7417
ns0.97
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s)
7250
ns7292
ns0.99
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA
191810
ns186417
ns1.03
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s)
8667
ns8541
ns1.01
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s)
8666.5
ns8500
ns1.02
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s)
8500
ns8709
ns0.98
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s)
5916
ns6125
ns0.97
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s)
10395.5
ns10625
ns0.98
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s)
17416.5
ns14792
ns1.18
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s)
10625
ns12000
ns0.89
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s)
7458
ns7500
ns0.99
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA
25409
ns24702.5
ns1.03
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s)
21666
ns21458
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s)
21500
ns21583
ns1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s)
21875
ns22042
ns0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s)
21792
ns21792
ns1
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA
200873
ns196629
ns1.02
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s)
56750
ns56833
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s)
56750
ns59166
ns0.96
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s)
56750
ns57208
ns0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s)
51333
ns54542
ns0.94
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s)
28375
ns28687.5
ns0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s)
29125
ns28709
ns1.01
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s)
28916
ns28792
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s)
45875
ns46041
ns1.00
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA
26566
ns25795
ns1.03
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s)
44041
ns44250
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s)
44375
ns47667
ns0.93
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s)
44125
ns44000
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s)
145041
ns63916
ns2.27
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA
172032.5
ns167633.5
ns1.03
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s)
68500
ns68417
ns1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s)
68687.5
ns68292
ns1.01
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s)
68333
ns68083
ns1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s)
145708
ns68125
ns2.14
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s)
2000
ns2500
ns0.80
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s)
1916.5
ns1750
ns1.10
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s)
2000
ns1792
ns1.12
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s)
1916
ns1708
ns1.12
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA
23659
ns23041
ns1.03
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s)
5500
ns5375
ns1.02
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s)
5125
ns5083
ns1.01
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s)
5375
ns5416
ns0.99
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s)
5375
ns5125
ns1.05
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA
175247.5
ns171497
ns1.02
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s)
8208
ns8375
ns0.98
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s)
8375
ns8167
ns1.03
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s)
8292
ns8208
ns1.01
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s)
5291
ns5708
ns0.93
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
34072083
ns34068625
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
40136625
ns40361624.5
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
43496250
ns43432603.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
153686583
ns56216958.5
ns2.73
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2640798
ns2631639
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
511580042
ns453239687.5
ns1.13
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
316303416.5
ns319327021
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
305395375.5
ns307674396
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
699851166
ns506119959
ns1.38
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15146264
ns15174112
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
745204500
ns735455458
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
693254333.5
ns706582229
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
743284104.5
ns743368604
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
1174961458.5
ns910398833
ns1.29
This comment was automatically generated by workflow using github-action-benchmark.