Skip to content

Commit

Permalink
docs: trigger build for docs (#1087)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 16, 2024
1 parent 888ea1e commit 3986545
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 10 deletions.
12 changes: 8 additions & 4 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ deploy_config = Documenter.auto_detect_deploy_system()
deploy_decision = Documenter.deploy_folder(deploy_config; repo="github.com/LuxDL/Lux.jl",
devbranch="main", devurl="dev", push_preview=true)

makedocs(; sitename="Lux.jl Docs",
makedocs(;
sitename="Lux.jl Docs",
authors="Avik Pal et al.",
clean=true,
doctest=false, # We test it in the CI, no need to run it here
Expand All @@ -97,7 +98,10 @@ makedocs(; sitename="Lux.jl Docs",
repo="github.com/LuxDL/Lux.jl", devbranch="main", devurl="dev",
deploy_url="https://lux.csail.mit.edu", deploy_decision),
draft=false,
pages)
pages
)

deploydocs(; repo="github.com/LuxDL/Lux.jl.git",
push_preview=true, target="build", devbranch="main")
deploydocs(;
repo="github.com/LuxDL/Lux.jl.git",
push_preview=true, target="build", devbranch="main"
)
3 changes: 2 additions & 1 deletion docs/run_single_tutorial.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,5 @@ end

Literate.markdown(
path, output_directory; execute=true, name, flavor=Literate.DocumenterFlavor(),
preprocess=Base.Fix1(preprocess, path), postprocess=Base.Fix1(postprocess, path))
preprocess=Base.Fix1(preprocess, path), postprocess=Base.Fix1(postprocess, path)
)
10 changes: 5 additions & 5 deletions docs/src/.vitepress/config.mts
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import footnote from "markdown-it-footnote";
import { transformerMetaWordHighlight } from "@shikijs/transformers";

const baseTemp = {
base: "REPLACE_ME_DOCUMENTER_VITEPRESS", // TODO: replace this in makedocs!
base: 'REPLACE_ME_DOCUMENTER_VITEPRESS', // TODO: replace this in makedocs!
};

// https://vitepress.dev/reference/site-config
export default defineConfig({
base: "REPLACE_ME_DOCUMENTER_VITEPRESS",
title: "REPLACE_ME_DOCUMENTER_VITEPRESS",
base: 'REPLACE_ME_DOCUMENTER_VITEPRESS',
title: 'REPLACE_ME_DOCUMENTER_VITEPRESS',
description: "Documentation for LuxDL Repositories",
cleanUrls: true,
outDir: "REPLACE_ME_DOCUMENTER_VITEPRESS", // This is required for MarkdownVitepress to work correctly...
outDir: 'REPLACE_ME_DOCUMENTER_VITEPRESS', // This is required for MarkdownVitepress to work correctly...

markdown: {
math: true,
Expand Down Expand Up @@ -424,7 +424,7 @@ export default defineConfig({
text: "Edit this page on GitHub",
},
socialLinks: [
{ icon: "github", link: "REPLACE_ME_DOCUMENTER_VITEPRESS" },
{ icon: "github", link: 'REPLACE_ME_DOCUMENTER_VITEPRESS' },
{ icon: "twitter", link: "https://twitter.com/avikpal1410" },
{ icon: "slack", link: "https://julialang.org/slack/" },
],
Expand Down

1 comment on commit 3986545

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 3986545 Previous: 888ea1e Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3792 ns 3917 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4084 ns 4125 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4834 ns 4916 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3959 ns 4083 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61509.5 ns 61146.5 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 10917 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10541 ns 10834 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 11250 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10250 ns 10833.5 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 431498.5 ns 428022 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1062.5 ns 1125 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1167 ns 1375 ns 0.85
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1333 ns 1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1208 ns 1208 ns 1
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18573 ns 18431 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4000 ns 3500 ns 1.14
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4000 ns 4208 ns 0.95
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4209 ns 4250 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3750 ns 4000 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 111184 ns 110418 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57750 ns 56958 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38542 ns 46584 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46583 ns 38458 ns 1.21
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82208 ns 82541 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37503.5 ns 37005 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2037645.5 ns 2031792 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2095625 ns 2084916.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1844375 ns 2098292 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2001375 ns 1994604.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196039 ns 194818 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145583 ns 144208 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143584 ns 146833 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146458 ns 145854.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145000 ns 155416.5 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168190 ns 165909 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1114291 ns 1062250 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1150292 ns 1115937.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 805500 ns 1107500 ns 0.73
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1122750 ns 1116875 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 526921 ns 521759.5 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3292 ns 3416 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3666 ns 3459 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4167 ns 4625 ns 0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3500 ns 3375 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 72235.5 ns 71735 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10125 ns 9333 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8375 ns 9458 ns 0.89
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8792 ns 10209 ns 0.86
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8833 ns 9042 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 480020 ns 496830.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14875 ns 14708 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15000 ns 15750 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17520.5 ns 17292 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14583 ns 17791.5 ns 0.82
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 53914 ns 53700 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214792 ns 225708.5 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 214875 ns 225416 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214750 ns 215417 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 226813 ns 212208 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 272785 ns 271526 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 417 ns 1.50
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 750 ns 0.83
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 917 ns 792 ns 1.16
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 459 ns 708 ns 0.65
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17774 ns 17628 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1417 ns 1.26
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1791 ns 0.79
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1709 ns 1875 ns 0.91
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1417 ns 1625 ns 0.87
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 102929.5 ns 103125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7125 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5250 ns 6000 ns 0.88
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5334 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 9917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23666 ns 23465 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225187.5 ns 223145.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 237479.5 ns 241208 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229334 ns 230125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 226709 ns 214458 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 168739 ns 168094 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3959 ns 3959 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23839 ns 23549 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16792 ns 16500 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16833 ns 17083 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16958 ns 16792 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16750 ns 16917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 161365 ns 161967.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 571458 ns 575041 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 576000 ns 576292 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 574041 ns 576625 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 571458 ns 571458 ns 1
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113559.5 ns 112966.5 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1425375 ns 1420500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1418875 ns 1421667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1418958 ns 1425458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1422750 ns 1418167 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 210833 ns 211527 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1076645.5 ns 1068625 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 934291 ns 968021 ns 0.97
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1340187.5 ns 1326812.5 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1294270.5 ns 1291125 ns 1.00
lenet(28, 28, 1, 64)/forward/GPU/CUDA 271656 ns 271093.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5796417 ns 5787333 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4651792 ns 4571958 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4918209 ns 4958979 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5515938 ns 5712792 ns 0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1071316.5 ns 1067736 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23948.5 ns 23893 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2167 ns 2084 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2209 ns 2250 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 169153 ns 171705 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 3625 ns 3500 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4084 ns 4042 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4687.5 ns 4792 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3709 ns 4084 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 66303.5 ns 66188 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11270.5 ns 11667 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11417 ns 11791 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11625 ns 12000 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10667 ns 11375 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 456550 ns 454536 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6312.5 ns 6187.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6770.5 ns 6542 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7792 ns 8166.5 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7083 ns 6291.5 ns 1.13
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52528 ns 52258 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18375 ns 16667 ns 1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17833 ns 18729.5 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17791 ns 18375 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16833 ns 16959 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 301396 ns 309572 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 584 ns 584 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32972 ns 32881.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9020.5 ns 9375 ns 0.96
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8459 ns 9250 ns 0.91
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9041 ns 9167 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8708 ns 8875 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 159042.5 ns 160215.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64542 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64895.5 ns 64459 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64292 ns 64792 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64542 ns 64500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 110877 ns 111097 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 284875 ns 283000 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 297937.5 ns 279520.5 ns 1.07
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 282333 ns 293000 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 274104.5 ns 284584 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 184904.5 ns 185358 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3295541 ns 3360124.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 2811062.5 ns 3074542 ns 0.91
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3016125 ns 2838792 ns 1.06
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3935209 ns 4085167 ns 0.96
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 572132 ns 581270.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7478250 ns 7606291.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7348937.5 ns 7454771 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7339479.5 ns 7331375 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8212959 ns 7941542 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1367334 ns 1351704.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18775625 ns 18789417 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19121334 ns 19123583 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19108667 ns 20307208 ns 0.94
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15653542 ns 15680375 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23560250 ns 23678834 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 42472875 ns 33872500 ns 1.25
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37127771 ns 40958583 ns 0.91
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34865500 ns 34902646 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1862818 ns 1864422.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 188025167 ns 189702708 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 176960479.5 ns 164774667 ns 1.07
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 152823708 ns 158085687.5 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 441336000 ns 441097750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13912250 ns 13899004 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290589750 ns 290371250 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 276449542 ns 338928292 ns 0.82
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296753875 ns 306426854 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 333259041 ns 333159833 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22875 ns 22167 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23333 ns 22854.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24125 ns 23958.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23542 ns 23084 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 98041.5 ns 96966 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103625 ns 103916 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 135834 ns 104666.5 ns 1.30
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 105084 ns 105458 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103250 ns 102667 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 518052 ns 508850 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6209 ns 5458 ns 1.14
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6500 ns 6042 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7041.5 ns 7042 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5959 ns 6291 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 70884 ns 69786 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15084 ns 15083 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15708 ns 16125 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16250 ns 16875 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14770.5 ns 15208 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 492747 ns 487835 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3001020.5 ns 3022209 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2085333 ns 2045708 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2274000 ns 2300875 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4550083 ns 4790083 ns 0.95
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 589071 ns 589360 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23511750 ns 23426083 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18279542 ns 18045062.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 16979209 ns 18263042 ns 0.93
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35598583 ns 35659959 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3111231 ns 3113003 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33266500 ns 33282791.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28064750 ns 27619271.5 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27365500 ns 27738250 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41824541.5 ns 41798521 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 71750 ns 74062.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74021 ns 72958.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74875 ns 74416 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 73458 ns 72333 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104698 ns 103887 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 314125.5 ns 221583 ns 1.42
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212229 ns 207708 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 323000 ns 317187 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218042 ns 307146 ns 0.71
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 559024 ns 555798 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11625 ns 11417 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12292 ns 12834 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12500 ns 13104 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11875 ns 15167 ns 0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 73943 ns 72891.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26583 ns 26416 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26667 ns 28000 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27708 ns 28208 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26666 ns 26625 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 493150 ns 487294 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12208 ns 11417 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12896 ns 13520.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13916 ns 13875 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12500 ns 12542 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 54608 ns 54288 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26125 ns 25667 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26000 ns 25959 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 25916.5 ns 26333 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26000 ns 26833 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 315887.5 ns 314956 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179208 ns 178708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183145.5 ns 181500 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 183166 ns 182750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 180125 ns 181062.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58575 ns 57250 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 582958.5 ns 581875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 596541.5 ns 583250 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 583833 ns 591542 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582834 ns 590812.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 294599.5 ns 293401.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6292 ns 5416 ns 1.16
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6459 ns 6166.5 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6750 ns 7437.5 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6041 ns 9167 ns 0.66
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 72806 ns 72329.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14542 ns 13875 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13333 ns 15375 ns 0.87
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15667 ns 15792 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14333 ns 13958.5 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 482192.5 ns 474586.5 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1177728.5 ns 1175416.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1356208.5 ns 1643125 ns 0.83
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1250750 ns 1273167 ns 0.98
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1317541 ns 1317228.5 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301448 ns 302286 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4117688 ns 4103708 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4491417 ns 4373292 ns 1.03
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4696854.5 ns 4773000 ns 0.98
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4452542 ns 4454896 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1051206.5 ns 1054654.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1875 ns 1791 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 24165 ns 24200 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5000 ns 4792 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4958 ns 4875 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4875 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4917 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 194564.5 ns 192415 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6041 ns 5542 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6000 ns 5709 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6145.5 ns 7104 ns 0.87
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5958 ns 5833 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 57313.5 ns 56178.5 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11979.5 ns 10917 ns 1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11854.5 ns 11709 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11042 ns 12041 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11292 ns 10875 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 342366 ns 346003.5 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 334 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 333 ns 334 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23004 ns 23150 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3000 ns 2708 ns 1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2750 ns 3042 ns 0.90
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3000 ns 3041 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2792 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 159207 ns 162167.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11583 ns 10917 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11292 ns 11417 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13437.5 ns 13167 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11708.5 ns 12291 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 57286.5 ns 57232 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25312.5 ns 24458 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25083 ns 24959 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25334 ns 25083 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25167 ns 25458 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 296722 ns 300859.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4125 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4167 ns 4250 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4167 ns 4209 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 25099 ns 25510 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16125 ns 16042 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16041 ns 16166 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16166 ns 16250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16042 ns 16250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 199370.5 ns 199412 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5833 ns 5667 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5833 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5792 ns 5750 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33986 ns 34134.5 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21083 ns 20958 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21125 ns 20937.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21208 ns 21375 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20667 ns 21167 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 176941.5 ns 179792 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 396792 ns 394084 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 354313 ns 373978.5 ns 0.95
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 489167 ns 468708 ns 1.04
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 521584 ns 517958.5 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66831 ns 67463 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 1005417 ns 995417 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 876583 ns 859333 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1235667 ns 1222292 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1420854 ns 1318979.5 ns 1.08
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 191762.5 ns 196257.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80250 ns 79625 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 80209 ns 81667 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84167 ns 83104 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81125 ns 80667 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193433 ns 193536.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1916083 ns 1916416 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1933854 ns 1914125 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1917917 ns 1929917 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1923708.5 ns 1920166 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 409629 ns 396634 ns 1.03
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22197 ns 22405 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 170854.5 ns 171684.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6791 ns 6000 ns 1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6417 ns 6666 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7375 ns 7771 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6959 ns 7083 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61202 ns 58222 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9291.5 ns 9000 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9166.5 ns 9166 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9375 ns 9417 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9334 ns 9791 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 313492.5 ns 309426.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120748834 ns 120543916.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181703729 ns 174574000 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148437750 ns 155303334 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104851584 ns 102497708 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5474996 ns 5490560 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 616853125 ns 619560667 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 579539270.5 ns 556574708 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 451846854.5 ns 466726875 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 757165312.5 ns 754133187 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34944567 ns 38221000 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 649889209 ns 650261583 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 688661771 ns 664971666.5 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 592710229 ns 598925604 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 741917708 ns 742000833 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59750 ns 58833 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38959 ns 47833 ns 0.81
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 48000 ns 38750 ns 1.24
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83416 ns 83541.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37459 ns 38157 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1922792 ns 1923645.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1985083 ns 1974708 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1978104 ns 1987125 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1893917 ns 1902791 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 174160 ns 176957.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 290625 ns 264584 ns 1.10
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 266708 ns 287375 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 271521 ns 281959 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 268167 ns 265292 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 132776.5 ns 127356.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 657229.5 ns 636958 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 681187.5 ns 669000 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 691583 ns 710937 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597417 ns 618041 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 713916 ns 717016.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2243937 ns 2207208 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2191895.5 ns 2220833 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2213542 ns 2257083 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2180437.5 ns 2184083.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133381 ns 133922 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5496875 ns 5489833 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5583292 ns 5482375 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5498250 ns 5611458 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5492750.5 ns 5518187.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 753967 ns 752554 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 636833 ns 643333 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 644417 ns 648250 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 645333 ns 641875 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 637292 ns 629833 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46993.5 ns 47345 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1826042 ns 1823375 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1667083 ns 1724583 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1726542 ns 1668166 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2105854.5 ns 2109958.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 222295 ns 225195 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58500 ns 57875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38708 ns 45041 ns 0.86
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47250 ns 37792 ns 1.25
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84292 ns 84250 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28598 ns 28912 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031041 ns 2037042 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2099020.5 ns 1774875 ns 1.18
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2091916.5 ns 2106104 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1856417 ns 2004709 ns 0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190652 ns 192033 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13391395.5 ns 13402709 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12453250 ns 12434083.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12557375.5 ns 12568959 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15140541 ns 15252437.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 514312 ns 515691 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47481750 ns 47190042 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41986250 ns 41818959 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 40944792 ns 41112729 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 57945917 ns 58040458 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3259544 ns 3265135 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 96867229.5 ns 74602708 ns 1.30
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 91436187.5 ns 90585333 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90591917 ns 90752104 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 76381625 ns 99199917 ns 0.77
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59083.5 ns 58666 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38750 ns 47667 ns 0.81
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47417 ns 38916 ns 1.22
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84000 ns 84583 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 46955 ns 47499 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1925125 ns 1924458 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1979250 ns 1963958 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1970729.5 ns 1984729 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1897750 ns 1890125 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 191790.5 ns 192543.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 291 ns 1.29
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 334 ns 0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32566 ns 32327.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6417 ns 6208 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6458 ns 6542 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6459 ns 6583 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6083 ns 6333 ns 0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 174123.5 ns 175547 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31409 ns 31532 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2833 ns 2667 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2791 ns 2917 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2834 ns 2875 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2583 ns 2625 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 161269 ns 161599.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 286258979.5 ns 285406812.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 346927270.5 ns 342256021 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 313997291.5 ns 320515458 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 270108416 ns 269110542 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7104986 ns 7087814 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 998016667 ns 998781958 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 959348209 ns 940297375 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 851652541.5 ns 865123208 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1162498166 ns 1166879459 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33999768 ns 34086790.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1672427541 ns 1303773521 ns 1.28
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1705785000 ns 1682982750 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1631619209 ns 1621725000 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1314128542 ns 1679721250 ns 0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1406813 ns 1409125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1416875 ns 1410750 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1459625 ns 1411520.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1407750 ns 1409625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127789 ns 127505 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5022896 ns 5022459 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5051333 ns 5010625 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5029542 ns 5048916.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5031875 ns 5026917 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 559312.5 ns 574233 ns 0.97
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 169600250 ns 175430604 ns 0.97
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 180340396 ns 129749208 ns 1.39
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 130036124.5 ns 147644584 ns 0.88
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 169790708.5 ns 156276000 ns 1.09
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 5056885.5 ns 4878356.5 ns 1.04
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 669854958 ns 835576208 ns 0.80
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 604244667 ns 648955208 ns 0.93
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 501867209 ns 552474042 ns 0.91
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 684062709 ns 684916667 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16520518 ns 18031032 ns 0.92
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8950666 ns 8921583.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8876958.5 ns 8765125 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7849458.5 ns 8191792 ns 0.96
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10185417 ns 10144916 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1594436 ns 1593700.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36026541.5 ns 36087854.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 38047792 ns 36659145.5 ns 1.04
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33343417 ns 34354041 ns 0.97
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 38792000 ns 38831791 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6457988 ns 6456160 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47417 ns 47375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47375 ns 47542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47584 ns 47542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47333 ns 47417 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18535 ns 18714 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50291 ns 50270.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50375 ns 50792 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50417 ns 50666 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50083 ns 50500 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 191873 ns 208956.5 ns 0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6458 ns 6209 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6917 ns 6916 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7750 ns 7479 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6958 ns 7375 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 91345 ns 103120 ns 0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10458 ns 9750 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9916 ns 10208 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10084 ns 10916 ns 0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10208 ns 10708 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 527140.5 ns 648155 ns 0.81
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 5833 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 6041 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6958 ns 7416.5 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 6250 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 120543 ns 142455.5 ns 0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13583 ns 13291 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13354.5 ns 13250 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13458 ns 13583 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13000 ns 13812.5 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 537999 ns 557240 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1042 ns 1084 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32473 ns 32427 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 7709 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7917 ns 8125 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7959 ns 8250 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8167 ns 8292 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 206314.5 ns 212730.5 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23437.5 ns 23125 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23167 ns 23229.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23584 ns 23208.5 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23542 ns 23291.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18671 ns 18651 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52458 ns 52791 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52541 ns 52750 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 53458 ns 52916 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52062.5 ns 52917 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 291832.5 ns 297972.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458937 ns 1456542 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1401583 ns 1402875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1403833.5 ns 1406125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1459708.5 ns 1406020.5 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195968 ns 195429 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5008771 ns 5011375 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5044104 ns 4999896 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5017250 ns 5004104 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5011916 ns 5012229.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 599687 ns 629207 ns 0.95
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3061000 ns 3028791 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2086750 ns 2080292 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2304917 ns 2306959 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4539041 ns 4528333.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 581670 ns 580927 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24376958 ns 24334708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19122667 ns 18811750 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19181062.5 ns 19285875 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36163041 ns 36624250 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3185287.5 ns 3188653 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34039875 ns 34044375 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28717291.5 ns 28337458.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28156000 ns 28359625 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41614584 ns 41526375 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144831583 ns 144751292 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 143542708 ns 142343042 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 124983229.5 ns 126295625.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173618479 ns 174518292 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22558463 ns 22564495 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1247182979 ns 926464812.5 ns 1.35
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 836595146 ns 1101725458.5 ns 0.76
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 738893583 ns 713128541 ns 1.04
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 672803125 ns 670038542 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118329511 ns 118583467 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 84666 ns 72500 ns 1.17
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73666 ns 73291 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76146 ns 75666.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75688 ns 73875 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 240753.5 ns 246837 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 287042 ns 234791.5 ns 1.22
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212354 ns 260042 ns 0.82
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 296854 ns 201917 ns 1.47
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 284250 ns 278875 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1238105 ns 1315953 ns 0.94
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35497979 ns 35443854.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35870917 ns 35414792 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32110833 ns 32315667 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40961896 ns 40952208.5 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5843453.5 ns 5844700.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 149169500 ns 147883042 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 155980437.5 ns 151270666.5 ns 1.03
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 134845625 ns 140446166.5 ns 0.96
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 287434667 ns 287799542 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34879809 ns 34900582 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121767709 ns 120464917 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 181613625 ns 174558291 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148039291 ns 155482959 ns 0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104612333.5 ns 105495999.5 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5485164 ns 5466861 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 472118833 ns 469121958 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 486130458.5 ns 467127125 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 440650208 ns 455180958.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 746192375 ns 741251875 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32245076 ns 35153962 ns 0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 643396416 ns 641357958 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 675303249.5 ns 655567666 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 575492166 ns 585153583 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 856961334 ns 844860208 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1312541 ns 1247062.5 ns 1.05
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 677667 ns 995875 ns 0.68
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 963459 ns 746854.5 ns 1.29
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2093375 ns 2056875 ns 1.02
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 580070.5 ns 576802 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2966541.5 ns 2967458 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2496854 ns 2623375 ns 0.95
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2623959 ns 2522834 ns 1.04
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3704083 ns 3709541.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1730505 ns 1870250 ns 0.93
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6656375 ns 6657292 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6477624.5 ns 6484416.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6431167 ns 6480437.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4450479.5 ns 4452792 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7334 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5417 ns 6167 ns 0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6084 ns 5416 ns 1.12
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 10083 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25252 ns 25185 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212583 ns 212729.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229770.5 ns 220541 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220500 ns 228291 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206083 ns 207000 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 251646.5 ns 290563 ns 0.87
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 301644020.5 ns 301766770.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 280942354.5 ns 222332187.5 ns 1.26
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 189363792 ns 224980750 ns 0.84
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 305392479 ns 312140792 ns 0.98
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7676597 ns 7675117 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1087372208.5 ns 1084663708.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 980974208 ns 904835749.5 ns 1.08
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 865965209 ns 854386750 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1158600916.5 ns 1158149229 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26533591 ns 26306824 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5354.5 ns 5416 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5375 ns 5375 ns 1
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6917 ns 6375 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4958 ns 6083 ns 0.82
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 146657 ns 188494 ns 0.78
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7395.5 ns 7709 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7375 ns 1
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7250 ns 7542 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7250 ns 7750 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 596011.5 ns 704994 ns 0.85
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 584 ns 541 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24031 ns 24467 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8917 ns 9000 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9708 ns 9417 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9583 ns 9750 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 8833 ns 9375 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 216620.5 ns 240486.5 ns 0.90
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 353333 ns 353833.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 352041 ns 353708 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352666.5 ns 352854.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352417 ns 353687.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21463 ns 21482 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 820625 ns 780084 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 828917 ns 822375.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 774875 ns 795166.5 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 778729 ns 807896 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 269469 ns 308900.5 ns 0.87
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 337187.5 ns 335708 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 313687.5 ns 336354 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 444709 ns 443833.5 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 334500 ns 324875 ns 1.03
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17922 ns 18282 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 689958 ns 684187.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 746333 ns 744666 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1025042 ns 1037333 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 694854.5 ns 690584 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 242950 ns 282432 ns 0.86
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 351417 ns 351833 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 327270.5 ns 346708.5 ns 0.94
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 414729.5 ns 433333 ns 0.96
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 371750 ns 367625 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22559 ns 23030 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 747208 ns 746875 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 749416 ns 752833 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1069374.5 ns 1077583 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 815937.5 ns 820333 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 224503 ns 243734 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3708 ns 3500 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3625 ns 3666 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3750 ns 3645.5 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3291 ns 3708 ns 0.89
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17855 ns 17855 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4208 ns 4125 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4208 ns 4459 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4333 ns 4458 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4208 ns 4584 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 248489.5 ns 265709 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3708 ns 3041 ns 1.22
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4167 ns 3625 ns 1.15
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4791 ns 4375 ns 1.10
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3792 ns 3834 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 203806 ns 202022.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8667 ns 8167 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8250 ns 8791 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8458 ns 8791 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8667 ns 8792 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1166315.5 ns 1169080.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204875 ns 205917 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209750 ns 210542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209834 ns 210750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200000 ns 200459 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34893 ns 35422 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 602917 ns 604333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 628833 ns 629416 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 621584 ns 628417 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 592041 ns 593229.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 321942.5 ns 325735.5 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 978791 ns 977521 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 937250.5 ns 938375 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 960250 ns 967604.5 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1307271 ns 1300334 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/CUDA 207418 ns 207218 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4504084 ns 4502021 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4619604.5 ns 4489021 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4294917 ns 4453583 ns 0.96
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6229292 ns 6274333.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 936037 ns 925859 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3354 ns 2958 ns 1.13
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3583 ns 3167 ns 1.13
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4417 ns 4208 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3333 ns 3916 ns 0.85
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 196464 ns 208529.5 ns 0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7334 ns 7417 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7417 ns 7333 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7291 ns 7875 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 7291 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 985634 ns 986067.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1640792 ns 1630104 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1171541.5 ns 1186917 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1327125 ns 1369208 ns 0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2384666 ns 2425375 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 216205.5 ns 214079.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12345499.5 ns 12320416.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9603042 ns 9599459 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9259895.5 ns 9406208 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18032958.5 ns 17994354.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1950941 ns 1943728.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17348083 ns 17326895.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14444583.5 ns 14332166.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14302167 ns 14502583 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21057645.5 ns 21072542 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 87666.5 ns 88166 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 89562 ns 92124.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 90292 ns 94000 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 88875 ns 134709 ns 0.66
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126565 ns 126226 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2024000 ns 2026417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2030958.5 ns 2016541 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1707583 ns 2054833 ns 0.83
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2030042 ns 2026813 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 999913 ns 1038168 ns 0.96
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 343750 ns 343333.5 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 326145.5 ns 341833 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 396833 ns 417604.5 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 309896 ns 302437.5 ns 1.02
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16654 ns 15633 ns 1.07
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 702666 ns 699708 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 733666 ns 732792 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1020166 ns 1028083 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 652500 ns 646021 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 190386.5 ns 194912.5 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7416 ns 7208 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5291 ns 6083 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5292 ns 1.13
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 10125 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34743 ns 33846 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224334 ns 219687.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229333 ns 231542 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220959 ns 226000 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206292 ns 217542 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 296926 ns 312247.5 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3667 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3792 ns 3709 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3667 ns 3750 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 23083 ns 22524 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14416 ns 14375 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14209 ns 14500 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14292 ns 14208 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14458 ns 14417 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 448235 ns 464888.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 92854 ns 91584 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 99583 ns 92500 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 94542 ns 97583 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 96042 ns 138791 ns 0.69
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125978 ns 125619 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1920562.5 ns 1712500 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1914937.5 ns 1913083 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1653792 ns 1947334 ns 0.85
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1928541 ns 1925375 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 893203 ns 931491 ns 0.96
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 878750 ns 866125 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 800021 ns 822833 ns 0.97
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1221729 ns 1155708.5 ns 1.06
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 963792 ns 955167 ns 1.01
lenet(28, 28, 1, 32)/forward/GPU/CUDA 277692.5 ns 274168 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2824834 ns 2710209 ns 1.04
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2464958 ns 2521125 ns 0.98
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3323271 ns 3343333 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3398958 ns 3411041.5 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1565101.5 ns 1667424.5 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17667 ns 17334 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15458.5 ns 15875 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17250.5 ns 16625 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 14645.5 ns 15166 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 142432.5 ns 146084.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218209 ns 216333 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222958.5 ns 228896 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216334 ns 218959 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215062.5 ns 228333 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 637432 ns 703455.5 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221145.5 ns 222021 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222375 ns 219625 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 220917 ns 222270.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 220333 ns 219792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 280530 ns 349468 ns 0.80
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 510354 ns 555354 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 499375 ns 541500 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 500021 ns 507791 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 507041 ns 509583 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1281236 ns 1442859 ns 0.89
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 332250 ns 328208 ns 1.01
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 316000 ns 335375 ns 0.94
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 364333 ns 440041.5 ns 0.83
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 323834 ns 315958.5 ns 1.02
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17441 ns 17029 ns 1.02
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 715833.5 ns 713479.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 735083 ns 737250.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1022959 ns 1022771 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 667041 ns 658584 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 193588.5 ns 197188.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18666 ns 17667 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17375 ns 18459 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19167 ns 19667 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17083.5 ns 19083 ns 0.90
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 147781 ns 167142 ns 0.88
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212542 ns 212084 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 214146 ns 214250 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213834 ns 219917 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 211354.5 ns 222583.5 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 877964 ns 1047376 ns 0.84
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4083 ns 4104.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4291.5 ns 3979.5 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5375 ns 5375 ns 1
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3958 ns 4666 ns 0.85
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 169898 ns 213956.5 ns 0.79
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10834 ns 10875 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10542 ns 10562.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10583 ns 11042 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10459 ns 10917 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 993411.5 ns 1051553 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3417 ns 3104.5 ns 1.10
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3167 ns 3270.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4375 ns 4375 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3062.5 ns 3583 ns 0.85
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 203556.5 ns 243243.5 ns 0.84
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7791 ns 7875 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7458 ns 7729.5 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7250 ns 7916 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7541 ns 7542 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1041955 ns 1070496 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23557729 ns 23697312 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43140979 ns 33840729 ns 1.27
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37880833 ns 40993667 ns 0.92
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34954917 ns 34934625 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1859678 ns 1799949 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184630708 ns 186397917 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 172192624.5 ns 158804792 ns 1.08
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146314396 ns 151420541 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 415449708 ns 414128875 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16494786 ns 16543234 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 428781042 ns 431001417 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 259710791 ns 253386479.5 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 231751208 ns 233549833 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 484878833 ns 484447625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183625 ns 184291 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183375 ns 183000 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184417 ns 185125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182667 ns 184854 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 177771.5 ns 228024.5 ns 0.78
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 590604 ns 592083 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 588083 ns 598459 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 586792 ns 630729 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 586958 ns 597020.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1015783.5 ns 1101301 ns 0.92
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3860917 ns 3831520.5 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3732375 ns 3861437.5 ns 0.97
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3478062.5 ns 3512750 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5358854.5 ns 5353125 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 533317.5 ns 533681 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17452375 ns 17425875.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17779209 ns 17302042 ns 1.03
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16551750 ns 17078292 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 22184000 ns 22192062 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2614491.5 ns 2765136 ns 0.95
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32765 ns 32753 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9625 ns 9875 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9542 ns 9125 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9625 ns 9958 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8917 ns 9458 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 263711.5 ns 264732 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 501494042 ns 502597125 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 411555459 ns 431706229.5 ns 0.95
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 374781084 ns 473571542 ns 0.79
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 672198042 ns 673775020.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12477100 ns 12478261 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2044775145.5 ns 2057099563 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1660536667 ns 1632471000 ns 1.02
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1495631604 ns 1543342583 ns 0.97
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2221523375 ns 2210188062.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49258137.5 ns 49309371 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1643291 ns 1549958 ns 1.06
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1172917 ns 1179083 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1391041.5 ns 1373000 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2338333 ns 2487854 ns 0.94
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215612.5 ns 216771.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12698542 ns 12739625.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9998999.5 ns 9965583 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9717041 ns 9786125 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18433792 ns 18379000 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2039696 ns 2050930 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17679687.5 ns 17630417 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14770854.5 ns 14653084 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14602583.5 ns 14783084 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21327625 ns 21379021 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26292 ns 26167 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26291 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26208 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 24225 ns 23941 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67250 ns 66666 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66834 ns 67458 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 68166 ns 67292 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66792 ns 67208 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 378162.5 ns 391319.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203125 ns 202708 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 208500 ns 209583 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 208666 ns 209041 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200125 ns 199791 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26005 ns 26680 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 646625 ns 626000 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 628813 ns 633708 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 669895.5 ns 626500 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 580791.5 ns 630375 ns 0.92
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 311381 ns 351467 ns 0.89
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 651667 ns 642167 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 638666 ns 572542 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 647417 ns 640500 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 653083.5 ns 643500 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131397 ns 131644.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2243375 ns 2256125 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2314937.5 ns 2244834 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2249625 ns 2286125 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2235375 ns 2239937 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1114755 ns 1255781 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18291 ns 18354.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17500 ns 18208 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20917 ns 19708 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18292 ns 19166 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 143094 ns 144781.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223500 ns 229291 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 226042 ns 258750 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 262917 ns 224104 ns 1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 230125 ns 230333 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 943015 ns 1072717 ns 0.88
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23380 ns 23495 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10104.5 ns 10166 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10166 ns 10042 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10000 ns 10584 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9583 ns 9916.5 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 254915.5 ns 257743.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5084 ns 4875 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5375 ns 5541.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6791 ns 6916 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5250 ns 6458 ns 0.81
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 190346.5 ns 224997.5 ns 0.85
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7250 ns 6834 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7709 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7250 ns 7791 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 7875 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 735734 ns 771041 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2167 ns 1875 ns 1.16
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2208 ns 2208 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2209 ns 2291 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2417 ns 2125 ns 1.14
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 18111 ns 18191 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6750 ns 6875 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6375 ns 6792 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6625 ns 6834 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6625 ns 6834 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 306022.5 ns 320849 ns 0.95
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 751583.5 ns 751000.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 748875 ns 746834 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 746812.5 ns 750750 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 748500 ns 749250 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21064 ns 21394 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 791834 ns 775375 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 788667 ns 797791 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 786646.5 ns 789542 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 792479 ns 792229.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 294710 ns 298871 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7417 ns 7375 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5208 ns 6083 ns 0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5375 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10084 ns 10125 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33108.5 ns 32874 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 228645.5 ns 260333 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231416 ns 266395.5 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 271625 ns 232084 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225958 ns 254562.5 ns 0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351410 ns 358414 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10292 ns 10042 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10084 ns 10000 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11166 ns 11333.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10000 ns 10583 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 209596.5 ns 246624.5 ns 0.85
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24709 ns 25125 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24333 ns 24875 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24291 ns 26500 ns 0.92
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24437.5 ns 24584 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1037550 ns 1075334 ns 0.96
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 107199542 ns 106802625 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 126347334 ns 117761313 ns 1.07
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120468625 ns 123597167 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117762042 ns 118005709 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2637816 ns 2586758.5 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393813416 ns 396471833 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 380007916 ns 366941541 ns 1.04
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 355873375 ns 358340709 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 484550250 ns 482420291 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15152772.5 ns 15213427.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 939763875 ns 762317541.5 ns 1.23
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 777743792 ns 763508042 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 745742833 ns 750628250 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 767071771.5 ns 949414250.5 ns 0.81
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7167 ns 9292 ns 0.77
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6833 ns 6833 ns 1
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8458 ns 8937 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7562.5 ns 7542 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 228024 ns 233625 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14250 ns 14125 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14042 ns 13375 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13875 ns 14166 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13333 ns 14292 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1000779 ns 1040314 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6167 ns 5292 ns 1.17
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8250 ns 8500 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5604.5 ns 6687.5 ns 0.84
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 214266.5 ns 227795.5 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12417 ns 12375 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12542 ns 12292 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12875 ns 12834 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12541 ns 13208 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 724930 ns 752144.5 ns 0.96
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 349208 ns 343584 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 326145.5 ns 342542 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 393333 ns 422396 ns 0.93
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 314271 ns 307708 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17228 ns 16984 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 706500 ns 703478.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 739437.5 ns 732500 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1020354 ns 1028375 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 658541 ns 649750 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 198297 ns 200115 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23935.5 ns 23291 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6500 ns 6250 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6584 ns 6458 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6584 ns 6791 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6250 ns 6542 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 240134 ns 237895.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5917 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5834 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24721 ns 24490 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21500 ns 21417 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21333 ns 20958 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21292 ns 21625 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21208 ns 21542 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 262379.5 ns 261762 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144229.5 ns 142958 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144042 ns 144250 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147292 ns 147875 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145833 ns 184292 ns 0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167351 ns 166717.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1320395.5 ns 1330292 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1358771 ns 1314521 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1324084 ns 1355792 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1329333.5 ns 1327770.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1268788 ns 1305974 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24083 ns 24584 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22375 ns 21792 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25104.5 ns 24250 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21917 ns 22250 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 280502 ns 280907 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 131646 ns 183708 ns 0.72
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 121334 ns 129375 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 177687.5 ns 177104.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 130209 ns 130708.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1380349 ns 1407612.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23199 ns 22943.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6708 ns 6458 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7083 ns 6709 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6708 ns 6792 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6083 ns 6667 ns 0.91
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 258254.5 ns 254857.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5042 ns 4459 ns 1.13
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4500 ns 4500 ns 1
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4917 ns 5583 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4917 ns 4833 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 243109 ns 243558 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10375 ns 10000 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10042 ns 10542 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10375 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10167 ns 10625 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1338362 ns 1305595.5 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1667 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1542 ns 1625 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23629 ns 22968 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5875 ns 5584 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5666 ns 5959 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5958 ns 5958 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5625 ns 5667 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 278503 ns 272222.5 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6825854.5 ns 6735562.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6429125 ns 6387645.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6541187.5 ns 6536625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7656375 ns 7531791 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215102 ns 213356 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24080834 ns 24025604 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21338208 ns 21251771 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21079333 ns 21005062.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29660375 ns 29807125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2111008 ns 2110248 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 48564000 ns 37237458 ns 1.30
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45595770.5 ns 45593041.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45721854 ns 45798709 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38038271 ns 49459000 ns 0.77
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5687.5 ns 5334 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6041 ns 6000 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6917 ns 7395.5 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5375 ns 6834 ns 0.79
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 239823 ns 227910 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8291 ns 8041 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8958 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 8833 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8667 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1069933 ns 1024262 ns 1.04
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1555021 ns 1489978.5 ns 1.04
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1235375.5 ns 1272375 ns 0.97
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1618375 ns 1615958.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2095209 ns 2147750 ns 0.98
lenet(28, 28, 1, 128)/forward/GPU/CUDA 285020 ns 272490.5 ns 1.05
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7898542 ns 7874499.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6630645.5 ns 6577854 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7200958 ns 7193458 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10372854.5 ns 10471167 ns 0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1904820 ns 1816656.5 ns 1.05
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 342000 ns 340250 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 323833 ns 347208 ns 0.93
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 382208 ns 416958 ns 0.92
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 342042 ns 334709 ns 1.02
batchedmm(128, Bsize=4)/forward/GPU/CUDA 43080 ns 46984 ns 0.92
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 725958 ns 730604 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 782938 ns 790083.5 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1067750 ns 1071292 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 737041.5 ns 738229.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 314201.5 ns 302775.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397583 ns 397292 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 211916 ns 288167 ns 0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288208 ns 212041 ns 1.36
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750834 ns 756500 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44587.5 ns 43949 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 670500 ns 673500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 470708 ns 531875 ns 0.88
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 531792 ns 473459 ns 1.12
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974083 ns 974125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 192970 ns 189220 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 651646 ns 599645.5 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 644458.5 ns 593000 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 659271 ns 645479 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 645333 ns 599833 ns 1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132814 ns 131878.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2440750 ns 2459000 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2525916.5 ns 2452125 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2439124.5 ns 2536291 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2464750 ns 2463125 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1349058.5 ns 1516114.5 ns 0.89
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 344292 ns 340708 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 326104 ns 346896 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 393875 ns 408875 ns 0.96
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 312896 ns 307667 ns 1.02
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16925 ns 16566.5 ns 1.02
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 709938 ns 700646 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 739917 ns 735458 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1021708 ns 1026375 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 650083.5 ns 646000 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 202873.5 ns 198090 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1458625 ns 1462084 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1490666 ns 1503958 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1498417 ns 1487166 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1436416 ns 1441875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41016 ns 41094.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5105458 ns 5136542 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5294583 ns 5289167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5292167 ns 5302000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5007208 ns 4991770.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 201135.5 ns 199139 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33479.5 ns 33022 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15292 ns 15084 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15125 ns 15500 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15291 ns 15167 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15042 ns 15291 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 381756.5 ns 363808 ns 1.05
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71209 ns 71334 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71250 ns 71333 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71125 ns 71375 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 70062.5 ns 71333 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 114111 ns 113636 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 318250 ns 323083 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 329625 ns 320292 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 318708 ns 333084 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 317958 ns 317958 ns 1
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 197229.5 ns 193954.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 1041 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24163 ns 23722 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8167 ns 8250 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8041 ns 8208 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8667 ns 8500 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7625 ns 8208 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 264271.5 ns 259533.5 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 464166.5 ns 464583 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 448167 ns 461104.5 ns 0.97
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 553459 ns 552750 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 548917 ns 554624.5 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129241.5 ns 129785 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1380229 ns 1390375 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1393229 ns 1380375 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1619541 ns 1608937.5 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1590270.5 ns 1604062 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 277974 ns 276124 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32417 ns 33098 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6375 ns 6583 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 6459 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6542 ns 6791 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 5958 ns 6291 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 267135 ns 267970 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1723834 ns 1723583 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1731042 ns 1720292 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1722458 ns 1738625 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1727375 ns 1724667 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168945.5 ns 169379 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4366646 ns 4361375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4396958.5 ns 4350333.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4374416.5 ns 4436167 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4349500 ns 4374667 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1192401 ns 1273197 ns 0.94
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6750 ns 6541 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6541 ns 6834 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7292 ns 7375 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6542 ns 6917 ns 0.95
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20406 ns 21180 ns 0.96
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 81771 ns 35667 ns 2.29
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 49083 ns 71541.5 ns 0.69
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 72271 ns 72292 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51334 ns 53000 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 213340.5 ns 251861 ns 0.85
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 354167 ns 350750 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 329541.5 ns 343312.5 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 401083 ns 436042 ns 0.92
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 321771 ns 315333 ns 1.02
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18865 ns 18874 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 722646.5 ns 718208.5 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 740500 ns 741959 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1030625 ns 1045208.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 673875 ns 671250 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 350549.5 ns 338267.5 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75250 ns 75250 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75250 ns 75458 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75458 ns 75209 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75042 ns 75375 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47823 ns 47612 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 324625 ns 334958.5 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 341667 ns 325541 ns 1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 324250 ns 340125 ns 0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 330833 ns 324750 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 216202 ns 215177 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1485500 ns 1487250 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1517334 ns 1529250 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1526000 ns 1513834 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1463167 ns 1466437.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 53576 ns 54137 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5124354.5 ns 5122625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5278542 ns 5274395.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5287917 ns 5290959 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4986958 ns 4995000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 209445 ns 208782 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28250 ns 28208 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28250 ns 28250 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28208 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28291 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 25452 ns 25447 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66333 ns 66208 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66250 ns 66375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66250 ns 66375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66333 ns 66750 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 539628 ns 505462.5 ns 1.07
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1483687.5 ns 1492333 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 859791.5 ns 1145208 ns 0.75
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1143208 ns 895291.5 ns 1.28
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2247229.5 ns 2232499.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 585407 ns 590189 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3085000 ns 3067958 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2591208 ns 2729834 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2737895.5 ns 2642083 ns 1.04
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3816250 ns 3820583.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2035890 ns 2059273 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8818187.5 ns 9001312.5 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8953500 ns 8781958.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8776854 ns 8758666.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6365041 ns 6346312 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80791 ns 80333 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 79875 ns 81417 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82792 ns 82500 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80708 ns 78270.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194256.5 ns 193540.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2013375 ns 2014625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1748958 ns 2014312.5 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2018500 ns 2021437.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2022750 ns 2020042 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 809328 ns 784625 ns 1.03

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.